Files
Android11/external/XNNPACK/bench/utils.cc
2023-10-13 14:01:41 +00:00

317 lines
9.7 KiB
C++
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.
#include <pthread.h>
#include <sched.h>
#ifdef __ANDROID__
#include <malloc.h>
#endif
#if defined(__SSE__) || defined(__x86_64__)
#include <xmmintrin.h>
#endif
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cpuinfo.h>
#include "bench/utils.h"
// Buffer that WipeCache() walks through PrefetchToL1(). It remains nullptr until
// InitWipeBuffer() allocates it.
static void* wipe_buffer = nullptr;
// Size of wipe_buffer in bytes; assigned by InitWipeBuffer().
static size_t wipe_buffer_size = 0;
// pthread_once() guard used by WipeCache() so that InitWipeBuffer() runs only once.
static pthread_once_t wipe_buffer_guard = PTHREAD_ONCE_INIT;
// One-time initializer for the cache-wiping buffer (run through pthread_once by WipeCache()).
//
// Sizes the buffer to the value returned by benchmark::utils::GetMaxCacheSize(), or to 128 MB
// when cpuinfo cannot be initialized, allocates it with 128-byte alignment and fills it with the
// byte pattern 0xA5.
//
// If the allocation fails, wipe_buffer is left as nullptr and wipe_buffer_size is reset to 0, so
// the (pointer, size) pair stays consistent and a later walk over it touches no memory.
static void InitWipeBuffer() {
  // Default: the largest known cache size (128 MB Intel Crystalwell L4 cache).
  wipe_buffer_size = 128 * 1024 * 1024;
  if (cpuinfo_initialize()) {
    wipe_buffer_size = benchmark::utils::GetMaxCacheSize();
  }
#if defined(__ANDROID__)
  // memalign is obsolete, but it is the only option on Android until API level 17.
  wipe_buffer = memalign(128, wipe_buffer_size);
#else
  // posix_memalign signals failure through its return value; the output pointer is not
  // guaranteed to be meaningful in that case, so normalize it to nullptr.
  if (posix_memalign(&wipe_buffer, 128, wipe_buffer_size) != 0) {
    wipe_buffer = nullptr;
  }
#endif
  if (wipe_buffer == nullptr) {
    // No buffer: advertise a size of zero so nobody reads through a null pointer.
    wipe_buffer_size = 0;
    return;
  }
  memset(wipe_buffer, 0xA5, wipe_buffer_size);
}
namespace benchmark {
namespace utils {
// Reads one byte from every `step`-byte stride of the buffer [ptr, ptr + size) and returns the
// sum of the bytes read.
//
// `step` is the L1 data cache line size reported by cpuinfo, or 16 bytes when cpuinfo cannot be
// initialized, reports no L1 data cache, or reports a line size of zero.
//
// Parameters:
//   ptr  - start of the buffer to read; a null pointer yields 0 without touching memory.
//   size - buffer length in bytes; a trailing remainder shorter than `step` is not read.
//
// Returns the sum of the sampled bytes. The value exists only so the compiler cannot discard the
// loads.
uint32_t PrefetchToL1(const void* ptr, size_t size) {
  if (ptr == nullptr) {
    return 0;
  }

  uint32_t step = 16;
  if (cpuinfo_initialize()) {
    const auto* l1d = cpuinfo_get_l1d_cache(0);
    // A missing cache descriptor must not be dereferenced, and a zero line size would make the
    // loop below spin forever without advancing; keep the default step in both cases.
    if (l1d != nullptr && l1d->line_size != 0) {
      step = l1d->line_size;
    }
  }

  const uint8_t* u8_ptr = static_cast<const uint8_t*>(ptr);
  // Compute and return sum of data to prevent compiler from removing data reads.
  uint32_t sum = 0;
  while (size >= step) {
    sum += uint32_t(*u8_ptr);
    u8_ptr += step;
    size -= step;
  }
  return sum;
}
// Makes sure the wipe buffer has been set up (exactly once, via pthread_once), then reads through
// the whole buffer with PrefetchToL1() and returns the value that call produces.
uint32_t WipeCache() {
  pthread_once(&wipe_buffer_guard, InitWipeBuffer);
  const uint32_t checksum = PrefetchToL1(wipe_buffer, wipe_buffer_size);
  return checksum;
}
// Sets the denormal-flushing bits in the floating-point control register of the target
// architecture. Compiles to an empty function on architectures not covered by the branches below.
void DisableDenormals() {
#if defined(__SSE__) || defined(__x86_64__)
// x86 SSE: OR 0x8040 into MXCSR, i.e. bit 15 (flush-to-zero) and bit 6 (denormals-are-zero).
_mm_setcsr(_mm_getcsr() | 0x8040);
#elif defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
// 32-bit ARM with hardware floating point: read FPSCR, set bit 24 (0x1000000, flush-to-zero)
// and write the register back. `fpscr` is only a scratch register for the sequence.
uint32_t fpscr;
__asm__ __volatile__(
"VMRS %[fpscr], fpscr\n"
"ORR %[fpscr], #0x1000000\n"
"VMSR fpscr, %[fpscr]\n"
: [fpscr] "=r" (fpscr));
#elif defined(__aarch64__)
// AArch64: read FPCR, set bit 24 (0x1000000, flush-to-zero) and bit 19 (0x80000, flush-to-zero
// for half precision), and write the register back. `fpcr` is only a scratch register.
uint64_t fpcr;
__asm__ __volatile__(
"MRS %[fpcr], fpcr\n"
"ORR %w[fpcr], %w[fpcr], 0x1000000\n"
"ORR %w[fpcr], %w[fpcr], 0x80000\n"
"MSR fpcr, %[fpcr]\n"
: [fpcr] "=r" (fpcr));
#endif
}
// Returns the current clock rate, in Hz, of the CPU the calling thread is running on.
//
// On Linux the value is read from
//   /sys/devices/system/cpu/cpu<N>/cpufreq/scaling_cur_freq
// (reported in kHz, hence the multiplication by 1000), where <N> comes from sched_getcpu().
//
// Returns 0 when the frequency cannot be determined: on non-Linux builds, when the current CPU
// cannot be identified, when the sysfs file cannot be opened, or when it does not contain a
// positive integer.
uint64_t GetCurrentCpuFrequency() {
#ifdef __linux__
  const int cpu = sched_getcpu();
  if (cpu < 0) {
    return 0;
  }

  char cpuinfo_name[512];
  snprintf(cpuinfo_name, sizeof(cpuinfo_name),
           "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_cur_freq", cpu);

  FILE* f = fopen(cpuinfo_name, "r");
  if (f != nullptr) {
    int freq = 0;
    // fscanf returns the number of converted fields, or EOF (a non-zero value) on an empty file
    // or read error, so success must be tested as "exactly one field converted".
    const int fields = fscanf(f, "%d", &freq);
    fclose(f);
    // Reject non-positive readings: a negative value would wrap to a huge unsigned frequency.
    if (fields == 1 && freq > 0) {
      return uint64_t(freq) * 1000;
    }
  }
#endif  // __linux__
  return 0;
}
// Returns, in bytes, the cache size to assume for this machine.
//
// - If cpuinfo cannot be initialized: a fixed architecture-wide maximum.
// - On ARM/ARM64: a per-microarchitecture maximum for the listed Cortex-A cores and for any
//   microarchitecture not listed; only the cores in the first case group use detected sizes.
// - Otherwise: the size of the highest cache level cpuinfo reports for processor 0
//   (L4, then L3, then L2, then L1D), or 0 if none is reported.
size_t GetMaxCacheSize() {
  if (!cpuinfo_initialize()) {
#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
    // DynamIQ max: 4 MB
    return 4 * 1024 * 1024;
#else
    // Intel eDRAM max: 128 MB
    return 128 * 1024 * 1024;
#endif
  }

  const cpuinfo_processor* processor = cpuinfo_get_processor(0);

#if CPUINFO_ARCH_ARM || CPUINFO_ARCH_ARM64
  // There is no precise way to detect cache size on ARM/ARM64, and cache size reported by
  // cpuinfo may underestimate the actual cache size. Thus, we use a microarchitecture-specific
  // maximum. Cases are grouped by the maximum they return.
  switch (processor->core->uarch) {
    // cpuinfo-detected cache size always correct: fall through to the detected sizes below.
    case cpuinfo_uarch_xscale:
    case cpuinfo_uarch_arm11:
    case cpuinfo_uarch_scorpion:
    case cpuinfo_uarch_krait:
    case cpuinfo_uarch_kryo:
    case cpuinfo_uarch_exynos_m1:
    case cpuinfo_uarch_exynos_m2:
    case cpuinfo_uarch_exynos_m3:
      break;

    // 512 KB
    //   Cortex-A5: max observed (NXP Vybrid SoC).
    case cpuinfo_uarch_cortex_a5:
      return 512 * 1024;

    // 1 MB
    //   Cortex-A7 MPCore Technical Reference Manual, 7.1 "About the L2 Memory system":
    //     configurable L2 cache size of 128KB, 256KB, 512KB, and 1MB.
    //   Cortex-A8 Technical Reference Manual, 8.1 "About the L2 memory system":
    //     configurable cache size of 0KB, 128KB, 256KB, 512KB, and 1MB.
    //   Cortex-A9: max observed (e.g. Exynos 4212).
    //   ARM Cortex-A35 Processor Technical Reference Manual, 7.1 "About the L2 memory system":
    //     configurable size of 128KB, 256KB, 512KB, and 1MB.
    case cpuinfo_uarch_cortex_a7:
    case cpuinfo_uarch_cortex_a8:
    case cpuinfo_uarch_cortex_a9:
    case cpuinfo_uarch_cortex_a35:
      return 1024 * 1024;

    // 2 MB
    //   ARM Cortex-A53 MPCore Processor Technical Reference Manual, 7.1:
    //     configurable L2 cache size of 128KB, 256KB, 512KB, 1MB and 2MB.
    //   ARM Cortex-A57 MPCore Processor Technical Reference Manual, 7.1:
    //     configurable L2 cache size of 512KB, 1MB, and 2MB.
    case cpuinfo_uarch_cortex_a53:
    case cpuinfo_uarch_cortex_a57:
      return 2 * 1024 * 1024;

    // 4 MB
    //   ARM Cortex-A15 MPCore Processor Technical Reference Manual, 7.1:
    //     configurable L2 cache size of 512KB, 1MB, 2MB and 4MB.
    //   ARM Cortex-A72 MPCore Processor Technical Reference Manual, 7.1:
    //     configurable L2 cache size of 512KB, 1MB, 2MB and 4MB.
    case cpuinfo_uarch_cortex_a15:
    case cpuinfo_uarch_cortex_a72:
      return 4 * 1024 * 1024;

    // 8 MB
    //   ARM Cortex-A17 MPCore Processor Technical Reference Manual, 7.1 (also used for A12):
    //     the cache size is implemented as either 256KB, 512KB, 1MB, 2MB, 4MB or 8MB.
    //   ARM Cortex-A73 MPCore Processor Technical Reference Manual, 7.1:
    //     a configurable size of 256KB, 512KB, 1MB, 2MB, 4MB, or 8MB.
    case cpuinfo_uarch_cortex_a12:
    case cpuinfo_uarch_cortex_a17:
    case cpuinfo_uarch_cortex_a73:
      return 8 * 1024 * 1024;

    // Anything else: 4 MB
    //   ARM DynamIQ Shared Unit Technical Reference Manual, 1.3 "Implementation options":
    //     L3_CACHE_SIZE is one of 256KB, 512KB, 1024KB, 1536KB, 2048KB, 3072KB, 4096KB.
    default:
      return 4 * 1024 * 1024;
  }
#endif

  // Use the highest cache level that cpuinfo reports for this processor.
  if (processor->cache.l4 != nullptr) {
    return processor->cache.l4->size;
  }
  if (processor->cache.l3 != nullptr) {
    return processor->cache.l3->size;
  }
  if (processor->cache.l2 != nullptr) {
    return processor->cache.l2->size;
  }
  if (processor->cache.l1d != nullptr) {
    return processor->cache.l1d->size;
  }
  return 0;
}
// Registers the thread-count argument "T" on a benchmark and adds the values to sweep over.
//
// Always adds 1 (disabled thread pool: execution on the caller thread only). When cpuinfo
// initializes successfully it additionally adds, in this order:
//   - 2 .. M, where M is the core count minus, if there is more than one cluster, the core count
//     of the last cluster (treated here as the little cores);
//   - the total core count, if it exceeds M;
//   - the logical processor count, if it exceeds the core count (hyperthreading).
void MultiThreadingParameters(benchmark::internal::Benchmark* benchmark) {
  benchmark->ArgName("T");

  // Disabled thread pool (execution on the caller thread only).
  benchmark->Arg(1);

  if (!cpuinfo_initialize()) {
    return;
  }

  const uint32_t total_cores = cpuinfo_get_cores_count();
  const uint32_t cluster_count = cpuinfo_get_clusters_count();

  // All cores except the little ones.
  uint32_t big_cores = total_cores;
  if (cluster_count > 1) {
    big_cores -= cpuinfo_get_cluster(cluster_count - 1)->core_count;
  }
  for (uint32_t threads = 2; threads <= big_cores; ++threads) {
    benchmark->Arg(threads);
  }

  // All cores (if more than one cluster).
  if (total_cores > big_cores) {
    benchmark->Arg(total_cores);
  }

  // All cores + hyperthreads (only if hyperthreading supported).
  const uint32_t total_processors = cpuinfo_get_processors_count();
  if (total_processors > total_cores) {
    benchmark->Arg(total_processors);
  }
}
// Returns true when cpuinfo initializes and reports ARM NEON support. Otherwise marks the
// benchmark as skipped with an error message and returns false.
bool CheckNEON(benchmark::State& state) {
  if (cpuinfo_initialize() && cpuinfo_has_arm_neon()) {
    return true;
  }
  state.SkipWithError("no NEON extension");
  return false;
}
// Returns true when cpuinfo initializes and reports ARM NEON-FMA support. Otherwise marks the
// benchmark as skipped with an error message and returns false.
bool CheckNEONFMA(benchmark::State& state) {
  if (cpuinfo_initialize() && cpuinfo_has_arm_neon_fma()) {
    return true;
  }
  state.SkipWithError("no NEON-FMA extension");
  return false;
}
// Returns true when cpuinfo initializes and reports x86 SSE4.1 support. Otherwise marks the
// benchmark as skipped with an error message and returns false.
bool CheckSSE41(benchmark::State& state) {
  if (cpuinfo_initialize() && cpuinfo_has_x86_sse4_1()) {
    return true;
  }
  state.SkipWithError("no SSE4.1 extension");
  return false;
}
// Returns true when cpuinfo initializes and reports x86 AVX support. Otherwise marks the
// benchmark as skipped with an error message and returns false.
bool CheckAVX(benchmark::State& state) {
  if (cpuinfo_initialize() && cpuinfo_has_x86_avx()) {
    return true;
  }
  state.SkipWithError("no AVX extension");
  return false;
}
// Returns true when cpuinfo initializes and reports x86 FMA3 support. Otherwise marks the
// benchmark as skipped with an error message and returns false.
bool CheckFMA3(benchmark::State& state) {
  if (cpuinfo_initialize() && cpuinfo_has_x86_fma3()) {
    return true;
  }
  state.SkipWithError("no FMA3 extension");
  return false;
}
// Returns true when cpuinfo initializes and reports x86 AVX2 support. Otherwise marks the
// benchmark as skipped with an error message and returns false.
bool CheckAVX2(benchmark::State& state) {
  if (cpuinfo_initialize() && cpuinfo_has_x86_avx2()) {
    return true;
  }
  state.SkipWithError("no AVX2 extension");
  return false;
}
// Returns true when cpuinfo initializes and reports x86 AVX512F support. Otherwise marks the
// benchmark as skipped with an error message and returns false.
bool CheckAVX512F(benchmark::State& state) {
  if (cpuinfo_initialize() && cpuinfo_has_x86_avx512f()) {
    return true;
  }
  state.SkipWithError("no AVX512F extension");
  return false;
}
} // namespace utils
} // namespace benchmark