// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <pthread.h>
|
||
#include <sched.h>
|
||
#ifdef __ANDROID__
|
||
#include <malloc.h>
|
||
#endif
|
||
#if defined(__SSE__) || defined(__x86_64__)
|
||
#include <xmmintrin.h>
|
||
#endif
|
||
|
||
#include <cstdio>
|
||
#include <cstdlib>
|
||
#include <cstring>
|
||
|
||
#include <cpuinfo.h>
|
||
|
||
#include "bench/utils.h"
|
||
|
||
|
||
// File-local state backing WipeCache(): a lazily created scratch buffer, its
// size in bytes, and the pthread_once guard that serializes its creation.
namespace {

// One-time initialization guard passed to pthread_once.
pthread_once_t wipe_buffer_guard = PTHREAD_ONCE_INIT;

// Number of bytes in the scratch buffer; zero until the buffer is set up.
size_t wipe_buffer_size = 0;

// Start of the scratch buffer; null until the buffer is set up.
void* wipe_buffer = nullptr;

}  // namespace
|
||
|
||
static void InitWipeBuffer() {
|
||
// Default: the largest know cache size (128 MB Intel Crystalwell L4 cache).
|
||
wipe_buffer_size = 128 * 1024 * 1024;
|
||
if (cpuinfo_initialize()) {
|
||
wipe_buffer_size = benchmark::utils::GetMaxCacheSize();
|
||
}
|
||
#if defined(__ANDROID__)
|
||
// memalign is obsolete, but it is the only option on Android until API level 17.
|
||
wipe_buffer = memalign(128, wipe_buffer_size);
|
||
#else
|
||
(void) posix_memalign((void**) &wipe_buffer, 128, wipe_buffer_size);
|
||
#endif
|
||
if (wipe_buffer != nullptr) {
|
||
memset(wipe_buffer, 0xA5, wipe_buffer_size);
|
||
}
|
||
}
|
||
|
||
namespace benchmark {
|
||
namespace utils {
|
||
|
||
uint32_t PrefetchToL1(const void* ptr, size_t size) {
|
||
uint32_t step = 16;
|
||
if (cpuinfo_initialize()) {
|
||
step = cpuinfo_get_l1d_cache(0)->line_size;
|
||
}
|
||
const uint8_t* u8_ptr = static_cast<const uint8_t*>(ptr);
|
||
// Compute and return sum of data to prevent compiler from removing data reads.
|
||
uint32_t sum = 0;
|
||
while (size >= step) {
|
||
sum += uint32_t(*u8_ptr);
|
||
u8_ptr += step;
|
||
size -= step;
|
||
}
|
||
return sum;
|
||
}
|
||
|
||
uint32_t WipeCache() {
|
||
pthread_once(&wipe_buffer_guard, &InitWipeBuffer);
|
||
return PrefetchToL1(wipe_buffer, wipe_buffer_size);
|
||
}
|
||
|
||
// Sets the flush-denormals control bits of the calling thread's
// floating-point control register. Does nothing on architectures not covered
// by one of the branches below.
void DisableDenormals() {
#if defined(__SSE__) || defined(__x86_64__)
  // MXCSR: 0x8000 is flush-to-zero, 0x0040 is denormals-are-zero.
  const unsigned int mxcsr = _mm_getcsr();
  _mm_setcsr(mxcsr | 0x8040);
#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
  // Read FPSCR, set bit 24, write it back.
  uint32_t control_word;
  __asm__ __volatile__(
    "VMRS %[fpscr], fpscr\n"
    "ORR %[fpscr], #0x1000000\n"
    "VMSR fpscr, %[fpscr]\n"
    : [fpscr] "=r" (control_word));
#elif defined(__aarch64__)
  // Read FPCR, set bits 24 and 19, write it back.
  uint64_t control_word;
  __asm__ __volatile__(
    "MRS %[fpcr], fpcr\n"
    "ORR %w[fpcr], %w[fpcr], 0x1000000\n"
    "ORR %w[fpcr], %w[fpcr], 0x80000\n"
    "MSR fpcr, %[fpcr]\n"
    : [fpcr] "=r" (control_word));
#endif
}
|
||
|
||
// Returns the current clock rate, in Hz, of the CPU the calling thread is
// running on.
//
// On Linux the value is read from the cpufreq `scaling_cur_freq` sysfs entry
// of the CPU reported by sched_getcpu() and multiplied by 1000. Returns 0 on
// other platforms, or when the entry cannot be opened or does not hold a
// positive integer.
uint64_t GetCurrentCpuFrequency() {
#ifdef __linux__
  const int cpu = sched_getcpu();
  char cpuinfo_name[512];
  snprintf(cpuinfo_name, sizeof(cpuinfo_name),
    "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_cur_freq", cpu);

  FILE* f = fopen(cpuinfo_name, "r");
  if (f != nullptr) {
    int freq = 0;
    // fscanf returns the number of converted fields, or EOF (a non-zero
    // value) on failure, so it must be compared against 1 explicitly.
    const int converted = fscanf(f, "%d", &freq);
    fclose(f);
    // Reject non-positive values: a negative int would wrap around when
    // converted to uint64_t.
    if (converted == 1 && freq > 0) {
      return uint64_t(freq) * 1000;
    }
  }
#endif  // __linux__
  return 0;
}
|
||
|
||
size_t GetMaxCacheSize() {
|
||
if (!cpuinfo_initialize()) {
|
||
#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
|
||
// DynamIQ max: 4 MB
|
||
return 4 * 1024 * 1024;
|
||
#else
|
||
// Intel eDRAM max: 128 MB
|
||
return 128 * 1024 * 1024;
|
||
#endif
|
||
}
|
||
const cpuinfo_processor* processor = cpuinfo_get_processor(0);
|
||
#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
|
||
// There is no precise way to detect cache size on ARM/ARM64, and cache size reported by cpuinfo
|
||
// may underestimate the actual cache size. Thus, we use microarchitecture-specific maximum.
|
||
switch (processor->core->uarch) {
|
||
case cpuinfo_uarch_xscale:
|
||
case cpuinfo_uarch_arm11:
|
||
case cpuinfo_uarch_scorpion:
|
||
case cpuinfo_uarch_krait:
|
||
case cpuinfo_uarch_kryo:
|
||
case cpuinfo_uarch_exynos_m1:
|
||
case cpuinfo_uarch_exynos_m2:
|
||
case cpuinfo_uarch_exynos_m3:
|
||
// cpuinfo-detected cache size always correct.
|
||
break;
|
||
case cpuinfo_uarch_cortex_a5:
|
||
// Max observed (NXP Vybrid SoC)
|
||
return 512 * 1024;
|
||
case cpuinfo_uarch_cortex_a7:
|
||
// Cortex-A7 MPCore Technical Reference Manual:
|
||
// 7.1. About the L2 Memory system
|
||
// The L2 memory system consists of an:
|
||
// - Optional tightly-coupled L2 cache that includes:
|
||
// - Configurable L2 cache size of 128KB, 256KB, 512KB, and 1MB.
|
||
return 1024 * 1024;
|
||
case cpuinfo_uarch_cortex_a8:
|
||
// Cortex-A8 Technical Reference Manual:
|
||
// 8.1. About the L2 memory system
|
||
// The key features of the L2 memory system include:
|
||
// - configurable cache size of 0KB, 128KB, 256KB, 512KB, and 1MB
|
||
return 1024 * 1024;
|
||
case cpuinfo_uarch_cortex_a9:
|
||
// Max observed (e.g. Exynos 4212)
|
||
return 1024 * 1024;
|
||
case cpuinfo_uarch_cortex_a12:
|
||
case cpuinfo_uarch_cortex_a17:
|
||
// ARM Cortex-A17 MPCore Processor Technical Reference Manual:
|
||
// 7.1. About the L2 Memory system
|
||
// The key features of the L2 memory system include:
|
||
// - An integrated L2 cache:
|
||
// - The cache size is implemented as either 256KB, 512KB, 1MB, 2MB, 4MB or 8MB.
|
||
return 8 * 1024 * 1024;
|
||
case cpuinfo_uarch_cortex_a15:
|
||
// ARM Cortex-A15 MPCore Processor Technical Reference Manual:
|
||
// 7.1. About the L2 memory system
|
||
// The features of the L2 memory system include:
|
||
// - Configurable L2 cache size of 512KB, 1MB, 2MB and 4MB.
|
||
return 4 * 1024 * 1024;
|
||
case cpuinfo_uarch_cortex_a35:
|
||
// ARM Cortex‑A35 Processor Technical Reference Manual:
|
||
// 7.1 About the L2 memory system
|
||
// L2 cache
|
||
// - Further features of the L2 cache are:
|
||
// - Configurable size of 128KB, 256KB, 512KB, and 1MB.
|
||
return 1024 * 1024;
|
||
case cpuinfo_uarch_cortex_a53:
|
||
// ARM Cortex-A53 MPCore Processor Technical Reference Manual:
|
||
// 7.1. About the L2 memory system
|
||
// The L2 memory system consists of an:
|
||
// - Optional tightly-coupled L2 cache that includes:
|
||
// - Configurable L2 cache size of 128KB, 256KB, 512KB, 1MB and 2MB.
|
||
return 2 * 1024 * 1024;
|
||
case cpuinfo_uarch_cortex_a57:
|
||
// ARM Cortex-A57 MPCore Processor Technical Reference Manual:
|
||
// 7.1 About the L2 memory system
|
||
// The features of the L2 memory system include:
|
||
// - Configurable L2 cache size of 512KB, 1MB, and 2MB.
|
||
return 2 * 1024 * 1024;
|
||
case cpuinfo_uarch_cortex_a72:
|
||
// ARM Cortex-A72 MPCore Processor Technical Reference Manual:
|
||
// 7.1 About the L2 memory system
|
||
// The features of the L2 memory system include:
|
||
// - Configurable L2 cache size of 512KB, 1MB, 2MB and 4MB.
|
||
return 4 * 1024 * 1024;
|
||
case cpuinfo_uarch_cortex_a73:
|
||
// ARM Cortex‑A73 MPCore Processor Technical Reference Manual
|
||
// 7.1 About the L2 memory system
|
||
// The L2 memory system consists of:
|
||
// - A tightly-integrated L2 cache with:
|
||
// - A configurable size of 256KB, 512KB, 1MB, 2MB, 4MB, or 8MB.
|
||
return 8 * 1024 * 1024;
|
||
default:
|
||
// ARM DynamIQ Shared Unit Technical Reference Manual
|
||
// 1.3 Implementation options
|
||
// L3_CACHE_SIZE
|
||
// - 256KB
|
||
// - 512KB
|
||
// - 1024KB
|
||
// - 1536KB
|
||
// - 2048KB
|
||
// - 3072KB
|
||
// - 4096KB
|
||
return 4 * 1024 * 1024;
|
||
}
|
||
#endif
|
||
if (processor->cache.l4 != NULL) {
|
||
return processor->cache.l4->size;
|
||
} else if (processor->cache.l3 != NULL) {
|
||
return processor->cache.l3->size;
|
||
} else if (processor->cache.l2 != NULL) {
|
||
return processor->cache.l2->size;
|
||
} else if (processor->cache.l1d != NULL) {
|
||
return processor->cache.l1d->size;
|
||
} else {
|
||
return 0;
|
||
}
|
||
}
|
||
|
||
// Registers the thread-count argument values ("T") for a benchmark:
//   - 1 (execution on the caller thread only), always;
// and, when cpuinfo initializes successfully:
//   - 2 .. N, where N is the core count minus the cores of the last cluster
//     when more than one cluster exists;
//   - the full core count, if it exceeds N;
//   - the logical processor count, if it exceeds the core count.
void MultiThreadingParameters(benchmark::internal::Benchmark* benchmark) {
  benchmark->ArgName("T");

  // Disabled thread pool (execution on the caller thread only).
  benchmark->Arg(1);

  if (!cpuinfo_initialize()) {
    return;
  }

  const uint32_t total_cores = cpuinfo_get_cores_count();
  const uint32_t cluster_count = cpuinfo_get_clusters_count();

  // All cores except the little ones.
  uint32_t big_cores = total_cores;
  if (cluster_count > 1) {
    big_cores -= cpuinfo_get_cluster(cluster_count - 1)->core_count;
  }
  uint32_t threads = 2;
  while (threads <= big_cores) {
    benchmark->Arg(threads);
    threads++;
  }

  // All cores (if more than one cluster).
  if (total_cores > big_cores) {
    benchmark->Arg(total_cores);
  }

  // All cores + hyperthreads (only if hyperthreading supported).
  const uint32_t total_processors = cpuinfo_get_processors_count();
  if (total_processors > total_cores) {
    benchmark->Arg(total_processors);
  }
}
|
||
|
||
|
||
// Returns true when cpuinfo initializes and reports ARM NEON support.
// Otherwise marks the benchmark run as skipped with an error and returns false.
bool CheckNEON(benchmark::State& state) {
  if (cpuinfo_initialize() && cpuinfo_has_arm_neon()) {
    return true;
  }
  state.SkipWithError("no NEON extension");
  return false;
}
|
||
|
||
// Returns true when cpuinfo initializes and reports ARM NEON-FMA support.
// Otherwise marks the benchmark run as skipped with an error and returns false.
bool CheckNEONFMA(benchmark::State& state) {
  if (cpuinfo_initialize() && cpuinfo_has_arm_neon_fma()) {
    return true;
  }
  state.SkipWithError("no NEON-FMA extension");
  return false;
}
|
||
|
||
// Returns true when cpuinfo initializes and reports x86 SSE4.1 support.
// Otherwise marks the benchmark run as skipped with an error and returns false.
bool CheckSSE41(benchmark::State& state) {
  if (cpuinfo_initialize() && cpuinfo_has_x86_sse4_1()) {
    return true;
  }
  state.SkipWithError("no SSE4.1 extension");
  return false;
}
|
||
|
||
// Returns true when cpuinfo initializes and reports x86 AVX support.
// Otherwise marks the benchmark run as skipped with an error and returns false.
bool CheckAVX(benchmark::State& state) {
  if (cpuinfo_initialize() && cpuinfo_has_x86_avx()) {
    return true;
  }
  state.SkipWithError("no AVX extension");
  return false;
}
|
||
|
||
// Returns true when cpuinfo initializes and reports x86 FMA3 support.
// Otherwise marks the benchmark run as skipped with an error and returns false.
bool CheckFMA3(benchmark::State& state) {
  if (cpuinfo_initialize() && cpuinfo_has_x86_fma3()) {
    return true;
  }
  state.SkipWithError("no FMA3 extension");
  return false;
}
|
||
|
||
// Returns true when cpuinfo initializes and reports x86 AVX2 support.
// Otherwise marks the benchmark run as skipped with an error and returns false.
bool CheckAVX2(benchmark::State& state) {
  if (cpuinfo_initialize() && cpuinfo_has_x86_avx2()) {
    return true;
  }
  state.SkipWithError("no AVX2 extension");
  return false;
}
|
||
|
||
// Returns true when cpuinfo initializes and reports x86 AVX512F support.
// Otherwise marks the benchmark run as skipped with an error and returns false.
bool CheckAVX512F(benchmark::State& state) {
  if (cpuinfo_initialize() && cpuinfo_has_x86_avx512f()) {
    return true;
  }
  state.SkipWithError("no AVX512F extension");
  return false;
}
|
||
|
||
} // namespace utils
|
||
} // namespace benchmark
|