add cvimath

commit ce8705f49da5e5f59c2ddb3253ef88323a0cd9c4
Author: sophgo-forum-service <forum_service@sophgo.com>
Date:   Mon May 13 14:04:10 2024 +0800

    [feat] cvimath opensource for cv18xx soc.

    - 9e8967
This commit is contained in:
carbon
2024-05-31 11:54:07 +08:00
parent e25f20f7a3
commit 83dc4914fe
55 changed files with 18671 additions and 0 deletions

View File

@ -19,3 +19,4 @@
| cvibuilder | cvibuilder | https://github.com/sophgo/cvibuilder.git | sg200x-dev | 4309f2a |
| cvikernel | cvikernel | https://github.com/sophgo/cvikernel.git | sg200x-dev | 9f1f57a |
| cviruntime | cviruntime | https://github.com/sophgo/cviruntime.git | sg200x-dev | 3f49386 |
| cvimath | cvimath | https://github.com/sophgo/cvimath.git | sg200x-dev | ce8705f |

108
cvimath/.clang-format Normal file
View File

@ -0,0 +1,108 @@
---
Language: Cpp
# BasedOnStyle: Google
AccessModifierOffset: -1
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: false
AlignConsecutiveDeclarations: false
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: All
AllowShortIfStatementsOnASingleLine: true
AllowShortLoopsOnASingleLine: true
AlwaysBreakAfterDefinitionReturnType: None
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: true
AlwaysBreakTemplateDeclarations: true
BinPackArguments: true
BinPackParameters: true
BraceWrapping:
AfterClass: false
AfterControlStatement: false
AfterEnum: false
AfterFunction: false
AfterNamespace: false
AfterObjCDeclaration: false
AfterStruct: false
AfterUnion: false
BeforeCatch: false
BeforeElse: false
IndentBraces: false
SplitEmptyFunction: true
SplitEmptyRecord: true
SplitEmptyNamespace: true
BreakBeforeBinaryOperators: None
BreakBeforeBraces: Attach
BreakBeforeInheritanceComma: false
BreakBeforeTernaryOperators: true
BreakConstructorInitializersBeforeComma: false
BreakConstructorInitializers: BeforeColon
BreakAfterJavaFieldAnnotations: false
BreakStringLiterals: true
ColumnLimit: 100
CommentPragmas: '^ IWYU pragma:'
CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: true
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
DerivePointerAlignment: true
DisableFormat: false
ExperimentalAutoDetectBinPacking: false
FixNamespaceComments: true
ForEachMacros:
- foreach
- Q_FOREACH
- BOOST_FOREACH
IncludeCategories:
- Regex: '^<.*\.h>'
Priority: 1
- Regex: '^<.*'
Priority: 2
- Regex: '.*'
Priority: 3
IncludeIsMainRegex: '([-_](test|unittest))?$'
IndentCaseLabels: true
IndentWidth: 2
IndentWrappedFunctionNames: false
JavaScriptQuotes: Leave
JavaScriptWrapImports: true
KeepEmptyLinesAtTheStartOfBlocks: false
MacroBlockBegin: ''
MacroBlockEnd: ''
MaxEmptyLinesToKeep: 1
NamespaceIndentation: None
ObjCBlockIndentWidth: 2
ObjCSpaceAfterProperty: false
ObjCSpaceBeforeProtocolList: false
PenaltyBreakAssignment: 2
PenaltyBreakBeforeFirstCallParameter: 1
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 200
PointerAlignment: Left
ReflowComments: true
SortIncludes: true
SortUsingDeclarations: true
SpaceAfterCStyleCast: false
SpaceAfterTemplateKeyword: true
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: ControlStatements
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 2
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
Standard: Auto
TabWidth: 8
UseTab: Never
...

5
cvimath/.gitignore vendored Normal file
View File

@ -0,0 +1,5 @@
.vscode
build
install

85
cvimath/CMakeLists.txt Normal file
View File

@ -0,0 +1,85 @@
# Top-level build script for the cvimath library.
#
# Inputs (pass with -D on the cmake command line):
#   TPU_SDK_ROOT          root of the TPU SDK (provides include/ and lib/)
#   CVI_EXTRA             optional extra prefix whose include/ is searched
#   ENABLE_TEST           "ON" to build tests/ (host builds only)
#   CMAKE_TOOLCHAIN_FILE  set for SoC cross builds; selects the default install dir
#
# cmake_minimum_required() must come before project() so that policy defaults
# are established before the toolchain is probed.
cmake_minimum_required(VERSION 3.2.2)
project(cvimath)

set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS ON)
#set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE)
#set(CMAKE_INSTALL_RPATH "\${ORIGIN}/../lib;\${ORIGIN}/")

# Default to a Release build when the user did not choose a build type.
if("${CMAKE_BUILD_TYPE}" STREQUAL "")
  set(CMAKE_BUILD_TYPE "Release")
endif()

# When the install prefix was left at /usr/local, install inside the source
# tree instead: install/ for host builds, install_soc/ for cross builds.
if("${CMAKE_TOOLCHAIN_FILE}" STREQUAL "")
  message("No toolchain file found. Using host compiler.")
  if("${CMAKE_INSTALL_PREFIX}" STREQUAL "/usr/local")
    set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/install")
  endif()
else()
  if("${CMAKE_INSTALL_PREFIX}" STREQUAL "/usr/local")
    set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/install_soc")
  endif()
endif()

# Flags shared by every build type.
set(CMAKE_C_INIT "-fsigned-char -fPIC -Werror=all -fdiagnostics-color=always")
set(CMAKE_CXX_INIT "-fsigned-char -fPIC -Werror=all -fdiagnostics-color=always -std=gnu++11")

# Only Release (or RELEASE) and Debug are supported; anything else aborts.
if("${CMAKE_BUILD_TYPE}" STREQUAL "Release" OR "${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_INIT} -O3")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_INIT} -O3")
elseif("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
  # Debug builds add stricter warnings and AddressSanitizer.
  set(SAFETY_FLAGS "-Werror -Wall -Wextra -ggdb -fno-strict-aliasing")
  set(SAFETY_FLAGS "${SAFETY_FLAGS} -fsanitize=address")
  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_INIT} -g -O0 ${SAFETY_FLAGS}")
  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_INIT} -g -O0 ${SAFETY_FLAGS}")
else()
  message(FATAL_ERROR "No build type!!!")
endif()

message("==================================================")
message("[Summary]")
message("C compiler ${CMAKE_C_COMPILER}")
message("CXX compiler ${CMAKE_CXX_COMPILER}")
message("Build type ${CMAKE_BUILD_TYPE}")
message("Install dir ${CMAKE_INSTALL_PREFIX}")
message("==================================================")

# External libraries; these variables are inherited by the subdirectories.
set(TPU_LD "-L${TPU_SDK_ROOT}/lib")
set(TPU_KERNEL_LIB "${TPU_LD} -lcvikernel")
# cvimath/cviruntime shared objects must exist before the tests link.
set(TEST_LIBS cvimath cviruntime)

# Include paths shared by src/, sample/ and tests/.
include_directories(
  ${TPU_SDK_ROOT}/include
  ${CVI_EXTRA}/include
  "${CMAKE_CURRENT_SOURCE_DIR}/include")

# https://stackoverflow.com/questions/30250494/ctest-not-detecting-tests
enable_testing()

# Tests are only built for host (non-cross) builds and only on request.
if(NOT CMAKE_CROSSCOMPILING)
  if("${ENABLE_TEST}" STREQUAL "ON")
    add_subdirectory(tests)
  endif()
endif()

add_subdirectory(src)
add_subdirectory(sample)

# Exported headers. file(GLOB) is kept so that a header missing from the tree
# is skipped rather than failing the install step.
file(GLOB HEADERS
  "${CMAKE_CURRENT_SOURCE_DIR}/include/cvimath.h"
  "${CMAKE_CURRENT_SOURCE_DIR}/include/cvimath_internal.h"
  "${CMAKE_CURRENT_SOURCE_DIR}/include/test_cvikernel_util.h"
)

# export sample
#file(GLOB SAMPLES sample/*)
#install(FILES ${SAMPLES} DESTINATION samples/cvimath)

# CMAKE_CURRENT_SOURCE_DIR (not CMAKE_SOURCE_DIR) so the toolchain file is
# still found when cvimath is added as a subdirectory of a larger build.
install(FILES "${CMAKE_CURRENT_SOURCE_DIR}/toolchain/toolchain-aarch64-linux.cmake"
        DESTINATION samples/cvimath)
install(FILES ${HEADERS} DESTINATION include/cvimath)

21
cvimath/README.md Normal file
View File

@ -0,0 +1,21 @@
# CviMath
## How to build
### Requirements
1. MLIR SDK
SOC mode
```
$ mkdir build
$ cd build
$ cmake -G Ninja .. -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DTOOLCHAIN_ROOT_DIR=${PWD}/../../gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu \
-DCMAKE_TOOLCHAIN_FILE=${PWD}/../toolchain/toolchain-aarch64-linux.cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX= \
-DTPU_SDK_ROOT=
$ ninja -j8 && ninja install
```

8
cvimath/clang-format.sh Executable file
View File

@ -0,0 +1,8 @@
#!/bin/bash
# Reformat, in place, every C/C++ source and header under include/, src/,
# tests/ and sample/ next to this script, using the clang-format found on PATH.
# All path expansions are quoted so a checkout path containing spaces works.
SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
CLANG_ROOT="$(readlink -f "$SCRIPT_DIR")"

# File extensions handled by clang-format (find -regex matches the whole path).
SOURCE_REGEX='.*\.\(cpp\|h\|hpp\|cc\|c\|cxx\|inc\)'

for subdir in include src tests sample; do
  find "$CLANG_ROOT/$subdir" -regex "$SOURCE_REGEX" -exec clang-format -i {} \;
done

84
cvimath/include/cvimath.h Normal file
View File

@ -0,0 +1,84 @@
#ifndef CVIMATH_H
#define CVIMATH_H
#include <stdint.h>
// public function
#ifdef __cplusplus
extern "C" {
#endif
/**
* @brief This function calculates the unit length of a precached i8 feature array
*
* @param precached Prefetched feature array in 1-D. Format: feature1, feature2, ...
* @param unit_precached_arr Output unit length.
* @param data_length The length of the feature.
* @param data_num The number of features.
*/
void cvm_gen_precached_i8_unit_length(int8_t *precached, float *unit_precached_arr,
const uint32_t data_length, const uint32_t data_num);
/**
* @brief This function calculates the unit length of a precached u8 feature array
*
* @param precached Prefetched feature array in 1-D. Format: feature1, feature2, ...
* @param unit_precached_arr Output unit length.
* @param data_length The length of the feature.
* @param data_num The number of features.
*/
void cvm_gen_precached_u8_unit_length(uint8_t *precached, float *unit_precached_arr,
const uint32_t data_length, const uint32_t data_num);
/**
* @brief Do inner product matching on i8 feature with given precached feature array.
*
* @param feature The input i8 feature to be compared.
* @param precached The precached feature array in 1-D.
* @param unit_precached_arr The unit length array of the precached.
* @param k_index The output matching index result in order.
* @param k_value The output matching value result in order.
* @param buffer The buffer used by this function, same length as precached.
* @param data_length The length of the single feature.
* @param data_num The number of features of the feature array.
* @param k Top k results, affects the length of k_index and k_value.
*/
void cvm_cpu_i8data_ip_match(int8_t *feature, int8_t *precached, float *unit_precached_arr,
uint32_t *k_index, float *k_value, float *buffer,
const uint32_t data_length, const uint32_t data_num, const uint32_t k);
/**
* @brief Do inner product matching on u8 feature with given precached feature array.
*
* @param feature The input u8 feature to be compared.
* @param precached The precached feature array in 1-D.
* @param unit_precached_arr The unit length array of the precached.
* @param k_index The output matching index result in order.
* @param k_value The output matching value result in order.
* @param buffer The buffer used by this function, same length as precached.
* @param data_length The length of the single feature.
* @param data_num The number of features of the feature array.
* @param k Top k results, affects the length of k_index and k_value.
*/
void cvm_cpu_u8data_ip_match(uint8_t *feature, uint8_t *precached, float *unit_precached_arr,
uint32_t *k_index, float *k_value, float *buffer,
const uint32_t data_length, const uint32_t data_num, const uint32_t k);
// Legacy support for hj.
/**
* @brief Legacy name for cvm_gen_precached_i8_unit_length(); forwards every argument unchanged.
*
* @param precached i8 feature array, passed through as-is.
* @param unit_precached_arr Output array, passed through as-is.
* @param data_length The length of the feature.
* @param data_num The number of features.
*/
inline void __attribute__((always_inline))
cvm_gen_db_i8_unit_length(int8_t *precached, float *unit_precached_arr, const uint32_t data_length,
const uint32_t data_num) {
cvm_gen_precached_i8_unit_length(precached, unit_precached_arr, data_length, data_num);
}
/**
* @brief Legacy name for cvm_gen_precached_u8_unit_length(); forwards every argument unchanged.
*
* @param precached u8 feature array, passed through as-is.
* @param unit_precached_arr Output array, passed through as-is.
* @param data_length The length of the feature.
* @param data_num The number of features.
*/
inline void __attribute__((always_inline))
cvm_gen_db_unit_length(uint8_t *precached, float *unit_precached_arr, const uint32_t data_length,
const uint32_t data_num) {
cvm_gen_precached_u8_unit_length(precached, unit_precached_arr, data_length, data_num);
}
#ifdef __cplusplus
}
#endif
#endif // CVIMATH_H

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,393 @@
#ifndef CVIMATH_TEST_UTIL_H
#define CVIMATH_TEST_UTIL_H
#include <cviruntime_context.h>
#include "cvikernel/cvikernel.h"
#include "bmruntime.h"
#include "bmruntime_bmkernel.h"
#include <assert.h>
#include <math.h> // pow
#include <stdint.h> // uint8_t / uint16_t
#include <stdio.h> /* printf, scanf, NULL */
#include <stdlib.h> /* malloc, free, rand */
#include <string.h> // strncpy
// copied from legacy code
// TODO: move to proper header files
#define __ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask))
#define ALIGN(x, a) __ALIGN_MASK(x, (__typeof__(x))(a)-1)
typedef uint32_t laddr_t;
typedef uint64_t gaddr_t;
typedef uint32_t ctrl_t;
#define CTRL_NULL 0
#define CTRL_AL (1 << 0) // alloc aligned with EU_NUM
#define CTRL_TP (1 << 5) // transpose
#define CTRL_NEURON (1 << 11) // mark neuron address in GDMA
#define LADDR_INVALID (0xFFFFFFFF)
#define GADDR_INVALID (0x000000FFFFFFFFFFULL)
/* Integer division rounded up: (numerator + denominator - 1) / denominator. */
static inline int ceiling_func(int numerator, int denominator) {
  const int biased = numerator + denominator - 1;
  return biased / denominator;
}
/* Division by 2^shift rounded up, done with shifts: (numerator + 2^shift - 1) >> shift. */
static inline int ceiling_func_shift(int numerator, int shift) {
  const int unit = 1 << shift;
  return (numerator + unit - 1) >> shift;
}
/*
 * Count the trailing zero bits of num, i.e. the largest n such that
 * num is divisible by 2^n.
 *
 * Returns 0 for num == 0. Zero has no set bit, so the scan below would
 * never terminate without this guard.
 */
static inline int get_num_shift(uint64_t num) {
  int n = 0;
  if (num == 0) {
    return 0;
  }
  while (!(num & 1)) {
    n++;
    num >>= 1;
  }
  return n;
}
#ifdef __cplusplus
extern "C" {
#endif
/*
* bm runtime binds with bm kernel.
* cvi kernel still needs bm runtime.
*
* Need to create the separate function to combine bm runtime and cvi kernel.
* Function with postfix _comp (compatible) for such combination.
*/
#define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
/**
* @brief submit command buffer
*
* @param rt_ctx runtime structure
* @param cvk_ctx kernel structure
*/
void test_submit_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx);
/**
* @brief alloc tensor from device memory
*
* @param rt_ctx runtime structure
* @param cvk_ctx kernel structure
* @param shape tensor shape
* @param fmt tensor format such as \CVK_FMT_U16 or \CVK_FMT_U8
*
* @return cvk_tg_t structure
*/
cvk_tg_t *test_alloc_tg_mem_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx,
cvk_tg_shape_t shape, cvk_fmt_t fmt);
/**
* @brief alloc matrix from device memory
*
* @param rt_ctx runtime structure
* @param shape matrix shape
* @param fmt tensor format such as \CVK_FMT_U16 or \CVK_FMT_U8
*
* @return cvk_mg_t structure
*/
cvk_mg_t *test_alloc_mg_mem_comp(CVI_RT_HANDLE *rt_ctx, cvk_mg_shape_t shape, cvk_fmt_t fmt);
/**
* @brief free tensor from device memory
*
* @param rt_ctx runtime structure
* @param tg pointer of tg
*/
void test_free_tg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_tg_t *tg);
/**
* @brief free matrix from device memory
*
* @param rt_ctx runtime structure
* @param mg pointer of mg
*/
void test_free_mg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_mg_t *mg);
/**
* @brief put host data to alloced tensor device memory
*
* @param rt_ctx runtime structure
* @param tg pointer of tg
* @param data[] host data
*/
void test_put_tg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_tg_t *tg, uint8_t data[]);
/**
* @brief put host data to alloced matrix device memory
*
* @param rt_ctx runtime structure
* @param mg pointer of mg
* @param data[] host data
*/
void test_put_mg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_mg_t *mg, uint8_t data[]);
/**
* @brief syntactic sugar for \test_alloc_mg_mem_comp -> \test_put_mg_mem_comp
*
* @param rt_ctx runtime structure
* @param mg_data_format mg format such as \CVK_FMT_U16 or \CVK_FMT_U8
* @param data[] host data
*
* @return
*/
cvk_mg_t *test_put_matrix_g(CVI_RT_HANDLE *rt_ctx, const cvk_mg_shape_t shape,
cvk_fmt_t mg_data_format, uint8_t data[]);
/**
* @brief get tensor data from device memory
*
* @param rt_ctx runtime structure
* @param tg pointer of tg
*
* @return data in device memory
*/
uint8_t *test_get_tg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_tg_t *tg);
/**
* @brief get matrix data from device memory
*
* @param rt_ctx runtime structure
* @param mg pointer of mg
*
* @return data in device memory
*/
uint8_t *test_get_mg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_mg_t *mg);
/**
* @brief get tensor data from tpu memory,
* the data path should be tpu memory -> device memory -> host memory
*
* @param rt_ctx runtime structure
* @param cvk_ctx kernel structure
* @param tl pointer of tl
*
* @return data in tpu memory
*/
uint8_t *test_get_tensor_l2g_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx,
const cvk_tl_t *tl);
/**
* @brief get matrix data from tpu memory,
* the data path should be tpu memory -> device memory -> host memory
*
* @param rt_ctx runtime structure
* @param cvk_ctx kernel structure
* @param ml pointer of ml
*
* @return data in tpu memory
*/
uint8_t *test_get_matrix_l2g_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx,
const cvk_ml_t *ml);
/**
* @brief put host data to tpu memory with tensor
* the data path should be host memory -> device memory -> tpu memory
*
* @param rt_ctx runtime structure
* @param cvk_ctx kernel structure
* @param tl pointer of tl
* @param data[] data in host memory
*/
void test_put_tensor_g2l_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, const cvk_tl_t *tl,
uint8_t data[]);
/**
* @brief put host data to tpu memory with matrix
* the data path should be host memory -> device memory -> tpu memory
*
* @param rt_ctx runtime structure
* @param cvk_ctx kernel structure
* @param ml pointer of ml
* @param data[] data in host memory
*/
void test_put_matrix_g2l_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, const cvk_ml_t *ml,
uint8_t data[]);
/**
* @brief alloc tensor from tpu memory
*
* @param cvk_ctx kernel structure
* @param shape shape of tensor
* @param fmt tensor format such as \CVK_FMT_U16 or \CVK_FMT_U8
* @param eu_align whether to align to the execution unit
*
* @return pointer of tl
*/
cvk_tl_t *test_alloc_tl(cvk_context_t *cvk_ctx, cvk_tl_shape_t shape, cvk_fmt_t fmt, int eu_align);
/**
* @brief free tpu memory with tensor
*
* @param cvk_ctx kernel structure
* @param tl pointer of tl
*/
void test_free_tl(cvk_context_t *cvk_ctx, const cvk_tl_t *tl);
/**
* @brief a small structure for getting RT memory information
*/
// Filled in by test_get_vp_addr() and consumed by test_free_vp_addr().
// NOTE(review): field meanings below are taken from the field names only — confirm
// against the implementation of those two functions.
typedef struct _AddrInfo
{
uint64_t phy_addr;  // presumably the physical (device) address
uint64_t size_bytes;  // presumably the size of the region, in bytes
uint8_t *vir_addr;  // presumably the host virtual address of the same region
int mem;  // opaque memory handle/identifier — semantics not visible in this header
}AddrInfo;
/**
* @brief get tpu global memory and assign info to an structure
*
* @param[in] ctx runtime structure
* @param[out] pAddrInfo a structure for physical, virtual address
*/
uint8_t *test_get_vp_addr(bmctx_t *ctx, AddrInfo *pAddrInfo);
/**
* @brief free tpu global memory from an info structure
*
* @param[in] ctx runtime structure
* @param[in] pAddrInfo a structure for physical, virtual address
*/
void test_free_vp_addr(bmctx_t *ctx, AddrInfo *pAddrInfo);
/**
* @brief wrapper function
*/
// tensor in local functions
// get tl size
/* Element count of a tl shape: n * c * h * w, accumulated in 64 bits. */
static inline uint64_t tl_shape_size(const cvk_tl_shape_t *s) {
  uint64_t count = (uint64_t)s->n;
  count *= s->c;
  count *= s->h;
  count *= s->w;
  return count;
}
/* Element count of a tg shape: n * c * h * w, accumulated in 64 bits. */
static inline uint64_t tg_shape_size(const cvk_tg_shape_t *s) {
  uint64_t count = (uint64_t)s->n;
  count *= s->c;
  count *= s->h;
  count *= s->w;
  return count;
}
/* Element count of an mg (matrix) shape: row * col, computed in 64 bits. */
static inline uint64_t mg_shape_size(const cvk_mg_shape_t *s) {
  uint64_t count = (uint64_t)s->row;
  count *= s->col;
  return count;
}
/*
 * Release tensor t through the context's lmem_free_tensor operation.
 *
 * The call is a plain statement: "return <expression>;" inside a void
 * function is not valid ISO C, and this header is shared by C and C++ code.
 */
static inline void free_tl(cvk_context_t *cvk_ctx, const cvk_tl_t *t) {
  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, t);
}
// Pair of tensor formats.
// NOTE(review): presumably describes a conversion from src_fmt to dst_fmt — no user
// of this type is visible in this header, confirm at the call sites.
typedef struct {
cvk_fmt_t src_fmt;  // source format
cvk_fmt_t dst_fmt;  // destination format
} cvk_fmt_type;
/*
 * Width in bits of one element of the given format:
 *   F32/I32 -> 32, F16/I16/U16/BF16 -> 16, I8/U8 -> 8, I4 -> 4, I2 -> 2, I1 -> 1.
 * Any other format trips assert(0); when assertions are compiled out the
 * function returns -1.
 */
static inline int bitsize_of_fmt(cvk_fmt_t fmt) {
  int bits = -1;
  switch (fmt) {
    case CVK_FMT_F32:
    case CVK_FMT_I32:
      bits = 32;
      break;
    case CVK_FMT_F16:
    case CVK_FMT_I16:
    case CVK_FMT_U16:
    case CVK_FMT_BF16:
      bits = 16;
      break;
    case CVK_FMT_I8:
    case CVK_FMT_U8:
      bits = 8;
      break;
    case CVK_FMT_I4:
      bits = 4;
      break;
    case CVK_FMT_I2:
      bits = 2;
      break;
    case CVK_FMT_I1:
      bits = 1;
      break;
    default:
      assert(0);
      break;
  }
  return bits;
}
/*
 * Width in bytes of one element: bitsize_of_fmt(fmt) / 8 with integer division,
 * so formats narrower than 8 bits yield 0.
 */
static inline int bytesize_of_fmt(cvk_fmt_t fmt) {
  const int bits = bitsize_of_fmt(fmt);
  return bits / 8;
}
/* Copy the n/c/h/w dimensions of a tg shape into a tl shape. */
static inline void tg_2_tl_shape(cvk_tl_shape_t *tl, cvk_tg_shape_t *tg) {
  const cvk_tg_shape_t src = *tg;
  tl->n = src.n;
  tl->c = src.c;
  tl->h = src.h;
  tl->w = src.w;
}
/* Copy the n/c/h/w dimensions of a tl shape into a tg shape. */
static inline void tl_2_tg_shape(cvk_tg_shape_t *tg, cvk_tl_shape_t *tl) {
  const cvk_tl_shape_t src = *tl;
  tg->n = src.n;
  tg->c = src.c;
  tg->h = src.h;
  tg->w = src.w;
}
/**
* @brief init test case with runtime/kernel
*
* @param rt_ctx runtime structure
* @param cvk_ctx kernel structure
*/
// static inline void _test_init(CVI_RT_HANDLE ctx, cvk_context_t **cvk_ctx) {
// CVI_RT_HANDLE _ctx = (CVI_RT_HANDLE)ctx;
// int ret = CVI_RT_Init(&_ctx);
// if (ret != CVI_SUCCESS) {
// fprintf(stderr, "init failed, err %d\n", ret);
// exit(-1);
// }
//
// int alloc_size = 0x10000;
// *cvk_ctx = (cvk_context_t*) CVI_RT_RegisterKernel(_ctx, alloc_size);
// printf("alloc command buffer %d bytes success\n", alloc_size);
//}
// static inline void _test_exit(CVI_RT_HANDLE ctx, cvk_context_t *cvk_ctx) {
// CVI_RT_UnRegisterKernel(cvk_ctx);
// CVI_RT_HANDLE _ctx = (CVI_RT_HANDLE)ctx;
// CVI_RT_DeInit(_ctx);
//}
/**
 * @brief Initialise the runtime and register a kernel context with a
 *        0x100000-byte command buffer.
 *
 * Terminates the process with exit(-1) when CVI_RT_Init() does not return
 * CVI_SUCCESS, or when CVI_RT_RegisterKernel() hands back a null context.
 * The null check exists so the "success" message is never printed for a
 * context that callers would then dereference.
 *
 * @param ctx runtime handle, initialised by CVI_RT_Init()
 * @param cvk_ctx receives the registered kernel context
 */
static inline void test_init(CVI_RT_HANDLE *ctx, cvk_context_t **cvk_ctx) {
  const int alloc_size = 0x100000;

  int ret = CVI_RT_Init(ctx);
  if (ret != CVI_SUCCESS) {
    fprintf(stderr, "init failed, err %d\n", ret);
    exit(-1);
  }

  *cvk_ctx = (cvk_context_t *)CVI_RT_RegisterKernel(*ctx, alloc_size);
  if (*cvk_ctx == NULL) {
    fprintf(stderr, "register kernel failed, command buffer %d bytes\n", alloc_size);
    exit(-1);
  }
  printf("alloc command buffer %d bytes success\n", alloc_size);
}
/**
* @brief de-init with runtime/kernel
*
* @param rt_ctx runtime structure
* @param cvk_ctx kernel structure
*/
/* Tear-down counterpart of test_init(): unregister the kernel context, then de-init the runtime. */
static inline void test_exit(CVI_RT_HANDLE *ctx, cvk_context_t *cvk_ctx) {
  CVI_RT_UnRegisterKernel(cvk_ctx);
  CVI_RT_DeInit(*ctx);
}
// converter bf16<->int8
uint8_t convert_bf16_u8(uint16_t data);
int8_t convert_bf16_s8(uint16_t data);
uint16_t convert_int8_bf16(uint8_t data, uint8_t sign);
uint32_t convert_fp32_u32(float fp32);
float convert_hex_fp32(uint32_t hval);
uint32_t convert_fp32_hex(float val);
float convert_bf16_fp32(uint16_t bf16);
uint16_t convert_fp32_bf16(float fp32);
int set_store_feround();
void restore_feround(int round_mode);
/*
 * Thin wrapper over malloc(). It does NOT abort on failure: a failed
 * allocation is reported to the caller as NULL, exactly like malloc().
 */
static inline void *xmalloc(size_t size) {
  return malloc(size);
}
#ifdef __cplusplus
}
#endif
#endif // CVIMATH_TEST_UTIL_H

View File

@ -0,0 +1,28 @@
# Builds one executable per *.cpp file in this directory, installs it to bin/
# and registers it with CTest under the same name.
project(cvimath_sample)

# Header search paths for the samples.
include_directories(
  ${CMAKE_SOURCE_DIR}/include
  ${TPU_SDK_ROOT}/include
  ${TPU_SDK_ROOT}/include/cvimath
)

# Libraries every sample links against.
set(TPU_KERNEL_LIB "-L${TPU_SDK_ROOT}/lib -lcvikernel")
set(TEST_LIBS cvimath cviruntime)

# Glob relative to this directory explicitly instead of relying on the
# implicit base directory of a "./" pattern.
file(GLOB CVI1835_SAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/*.cpp")

foreach(SAMPLE_SRC ${CVI1835_SAMPLE})
  # Target name = source file name without directory and extension.
  get_filename_component(SAMPLE_NAME "${SAMPLE_SRC}" NAME_WE)
  # SAMPLE_UTIL is not set in this file; it expands to nothing unless the
  # caller provides extra utility sources.
  add_executable(${SAMPLE_NAME} ${SAMPLE_UTIL} "${SAMPLE_SRC}")
  target_link_libraries(${SAMPLE_NAME} PRIVATE ${TPU_KERNEL_LIB} ${TEST_LIBS})
  set_target_properties(${SAMPLE_NAME} PROPERTIES COMPILE_FLAGS "-Werror -Wall -Wextra")
  install(TARGETS ${SAMPLE_NAME} DESTINATION bin)
  # NAME/COMMAND signature: CMake replaces the target name with the path of
  # the built executable, which the legacy positional signature does not do.
  add_test(NAME ${SAMPLE_NAME} COMMAND ${SAMPLE_NAME} ctest_test)
endforeach()

21
cvimath/sample/README.md Normal file
View File

@ -0,0 +1,21 @@
# CVIMath
## How to build
### Requirements
1. MLIR SDK
SOC mode
```
$ mkdir build
$ cd build
$ cmake -G Ninja .. -DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \
-DTOOLCHAIN_ROOT_DIR=${PWD}/../../gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu \
-DCMAKE_TOOLCHAIN_FILE=${PWD}/../toolchain-aarch64-linux.cmake \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_INSTALL_PREFIX= \
-DTPU_SDK_ROOT=
$ ninja -j8 && ninja install
```

View File

@ -0,0 +1,130 @@
// \file sample for converting a bf16 tensor to fp32
// header include
#include <assert.h>
#include <cvimath_internal.h> // math
#include <test_cvikernel_util.h> // kerenl
// Fill input_data[0..ifmap_size) with the bf16 encoding of 0.0, 1.0, 2.0, ...
void init_input(uint16_t *input_data, uint64_t ifmap_size) {
  uint64_t idx = 0;
  while (idx < ifmap_size) {
    input_data[idx] = convert_fp32_bf16(idx * 1.0);
    ++idx;
  }
}
// Build the expected fp32 bit patterns for a bf16 -> fp32 conversion.
//
// Each reference word carries the bf16 bits in its upper 16 bits and zeros in
// its lower 16 bits. The value is produced with a shift rather than by writing
// halves of a union, so the result does not depend on the host byte order
// (the union form only gave this layout on little-endian hosts) and avoids
// reading an inactive union member.
//
// input_data  bf16 values, ifmap_size entries
// ref_data    output, ifmap_size 32-bit words
// ifmap_size  number of elements to convert
void init_ref(uint16_t *input_data, uint32_t *ref_data, uint64_t ifmap_size) {
  for (uint64_t i = 0; i < ifmap_size; i++) {
    ref_data[i] = (uint32_t)input_data[i] << 16;
  }
}
// Runs one bf16 -> fp32 conversion on the device and compares the result with a
// host-side reference built by init_ref(). Exits the process with -1 on the first mismatch.
//
// rt_ctx         runtime handle
// cvk_ctx        kernel context
// bf16_tg_shape  shape of the bf16 input tensor
static void testbench(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx,
cvk_tg_shape_t *bf16_tg_shape) {
// shapes used only to compute host buffer sizes
cvk_tl_shape_t ifmap_shape = {bf16_tg_shape->n, bf16_tg_shape->c, bf16_tg_shape->h,
bf16_tg_shape->w};
// * 2 means fp32 takes twice size of bf16
cvk_tl_shape_t ofmap_shape = {bf16_tg_shape->n, bf16_tg_shape->c, bf16_tg_shape->h,
bf16_tg_shape->w * 2};
uint64_t ifmap_size = tl_shape_size(&ifmap_shape);
// ofmap_size is 2 * ifmap_size because w was doubled above
uint64_t ofmap_size = tl_shape_size(&ofmap_shape);
// unit size is 1 bytes, bf16 takes 2 bytes
int data_type_size = 2;
uint64_t ifmap_bytesize = ifmap_size * data_type_size;
// * 2 means fp32 takes twice size of bf16
// NOTE(review): ofmap_size already includes the factor 2 (doubled w), so this allocates
// 8 bytes per input element while init_ref() writes 4 — looks over-allocated, confirm.
uint64_t ofmap_bytesize = ofmap_size * data_type_size * 2;
// NOTE(review): xmalloc() results are used without a NULL check.
uint8_t *input_data = (uint8_t *)xmalloc(ifmap_bytesize);
uint8_t *ref_data = (uint8_t *)xmalloc(ofmap_bytesize);
// init input / output data in ddr
init_input((uint16_t *)input_data, ifmap_size);
init_ref((uint16_t *)input_data, (uint32_t *)ref_data, ifmap_size);
// send host memory->device memory
cvk_fmt_t fmt = CVK_FMT_BF16;
// the fp32 output is allocated as a bf16 tensor with doubled width
cvk_tg_shape_t fp32_tg_shape;
fp32_tg_shape = {ofmap_shape.n, ofmap_shape.c, ofmap_shape.h, ofmap_shape.w};
cvk_tg_t *bf16_tg = test_alloc_tg_mem_comp(rt_ctx, cvk_ctx, *bf16_tg_shape, fmt);
test_put_tg_mem_comp(rt_ctx, bf16_tg, (uint8_t *)input_data);
cvk_tg_t *fp32_tg = test_alloc_tg_mem_comp(rt_ctx, cvk_ctx, fp32_tg_shape, fmt);
// prepare command buffer
cvm_bf16_fp32(cvk_ctx, bf16_tg, fp32_tg);
// submit descriptor
test_submit_comp(rt_ctx, cvk_ctx);
// read the fp32 result back from the output tg
uint8_t *ofmap_data = test_get_tg_mem_comp(rt_ctx, fp32_tg);
// compare with the reference byte by byte
// NOTE(review): the loop checks ofmap_size (= 2 * ifmap_size) bytes, but init_ref() wrote
// ifmap_size 32-bit words (4 * ifmap_size bytes) — only half of the reference appears to
// be compared; confirm whether the bound should cover the whole output.
for (uint32_t i = 0; i < ofmap_size; i++) {
if (ref_data[i] != ofmap_data[i]) {
fprintf(stderr, "comparing failed output[%u] got %u, ref %u\n", i, ofmap_data[i],
ref_data[i]);
// fail case
exit(-1);
}
}
// free resource from tpu memory
test_free_tg_mem_comp(rt_ctx, bf16_tg);
test_free_tg_mem_comp(rt_ctx, fp32_tg);
// free resource from host memory
free(input_data);
free(ref_data);
// ofmap_data is released with free(), like the xmalloc() buffers above
free(ofmap_data);
}
int main() {
CVI_RT_HANDLE rt_ctx;
cvk_context_t *cvk_ctx;
int round_mode;
// align kerenl rounding mode
round_mode = set_store_feround();
// init runtime / kerenl structure
test_init(&rt_ctx, &cvk_ctx);
cvk_tg_shape_t bf16_tg_shape = {1, 2, 3, 4};
{
// test 1
printf("test bf16 <%d,%d,%d,%d> to fp32\n", bf16_tg_shape.n, bf16_tg_shape.c, bf16_tg_shape.h,
bf16_tg_shape.w);
testbench(&rt_ctx, cvk_ctx, &bf16_tg_shape);
printf("compare test bf16 to fp32 done\n");
}
{
// test 2
bf16_tg_shape = {1, 20, 30, 40};
printf("test bf16 <%d,%d,%d,%d> to fp32\n", bf16_tg_shape.n, bf16_tg_shape.c, bf16_tg_shape.h,
bf16_tg_shape.w);
testbench(&rt_ctx, cvk_ctx, &bf16_tg_shape);
printf("compare test bf16 to fp32 done\n");
}
// de-init runtime / kerenl structure
test_exit(&rt_ctx, cvk_ctx);
// restore rounding mode
restore_feround(round_mode);
return 0;
}

View File

@ -0,0 +1,109 @@
// \file sample for converting an fp32 tensor to bf16
// header include
#include <assert.h>
#include <cvimath_internal.h> // math
#include <test_cvikernel_util.h> // kerenl
// Fill input_data[0..ifmap_size) with a deterministic 32-bit pattern:
// entry i is ((0x1234 + i) << 16) + 0x5678 + i, truncated to 32 bits.
void init_input(uint32_t *input_data, uint64_t ifmap_size) {
  uint64_t idx = 0;
  while (idx < ifmap_size) {
    const uint64_t high_part = (0x1234 + idx) << 16;
    const uint64_t low_part = 0x5678 + idx;
    input_data[idx] = (uint32_t)(high_part + low_part);
    ++idx;
  }
}
// Runs one fp32 -> bf16 conversion on the device and checks that every output value equals
// the upper 16 bits of the corresponding 32-bit input word. Exits the process with -1 on
// the first mismatch.
//
// rt_ctx         runtime handle
// cvk_ctx        kernel context
// fp32_tg_shape  shape used for both the input and the output tensor
static void testbench(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx,
cvk_tg_shape_t *fp32_tg_shape) {
// shape used only to compute the host buffer size
cvk_tl_shape_t ifmap_shape = {fp32_tg_shape->n, fp32_tg_shape->c, fp32_tg_shape->h,
fp32_tg_shape->w};
uint64_t ifmap_size = tl_shape_size(&ifmap_shape);
// unit size is 1 bytes, bf16 takes 2 bytes
int data_type_size = 2;
uint64_t ifmap_bytesize = ifmap_size * data_type_size;
// NOTE(review): the xmalloc() result is used without a NULL check.
uint8_t *input_data = (uint8_t *)xmalloc(ifmap_bytesize);
// number of 32-bit words that fit in the input buffer (despite the name, a count, not bytes)
uint64_t ifmap_bytesize_per_fp32 = ifmap_bytesize / 4; // 4 means float takes 4 bytes
// init input / output data in ddr
init_input((uint32_t *)input_data, ifmap_bytesize_per_fp32);
// send host memory->device memory
// NOTE(review): the fp32 input tensor is allocated with CVK_FMT_BF16 and the unchanged
// shape, i.e. 2 bytes per element — confirm this is the layout cvm_s2s_fp32_bf16 expects.
cvk_fmt_t fmt = CVK_FMT_BF16;
cvk_tg_t *fp32_tg = test_alloc_tg_mem_comp(rt_ctx, cvk_ctx, *fp32_tg_shape, fmt);
test_put_tg_mem_comp(rt_ctx, fp32_tg, (uint8_t *)input_data);
cvk_tg_shape_t bf16_tg_shape = *fp32_tg_shape;
cvk_tg_t *bf16_tg = test_alloc_tg_mem_comp(rt_ctx, cvk_ctx, bf16_tg_shape, fmt);
// prepare command buffer
cvm_s2s_fp32_bf16(cvk_ctx, fp32_tg->start_address, fp32_tg->shape, bf16_tg->start_address,
bf16_tg->shape, fmt);
// submit descriptor
test_submit_comp(rt_ctx, cvk_ctx);
// read the bf16 result back from the output tg
uint8_t *ofmap_data = test_get_tg_mem_comp(rt_ctx, bf16_tg);
// compare each output value with the upper half of the matching input word
uint16_t *ofmap_data_bf16 = (uint16_t *)ofmap_data;
uint32_t *input_data_i32 = (uint32_t *)input_data;
for (uint32_t i = 0; i < ifmap_bytesize_per_fp32; i++) {
// expected value: upper 16 bits of the 32-bit input word
uint16_t _input_data_i16 = (input_data_i32[i] >> 16) & 0xffff;
if (_input_data_i16 != ofmap_data_bf16[i]) {
fprintf(stderr, "comparing failed output[%u] got %u, ref %u\n", i, ofmap_data_bf16[i],
_input_data_i16);
// fail case
exit(-1);
}
}
// free resource from tpu memory
test_free_tg_mem_comp(rt_ctx, bf16_tg);
test_free_tg_mem_comp(rt_ctx, fp32_tg);
// free resource from host memory
free(input_data);
// ofmap_data is released with free(), like the xmalloc() buffer above
free(ofmap_data);
}
// Entry point of the fp32 -> bf16 sample: runs testbench() for two shapes
// between runtime/kernel set-up and tear-down.
//
// The completion message names the conversion this sample performs
// (fp32 to bf16); it used to repeat the text of the bf16 -> fp32 sample.
int main() {
  CVI_RT_HANDLE rt_ctx;
  cvk_context_t *cvk_ctx;
  int round_mode;

  // align kernel rounding mode; the returned mode is restored before exit
  round_mode = set_store_feround();

  // init runtime / kernel structure
  test_init(&rt_ctx, &cvk_ctx);

  cvk_tg_shape_t fp32_tg_shape = {1, 2, 3, 4};
  {
    // test 1
    printf("test fp32 <%d,%d,%d,%d> to bf16\n", fp32_tg_shape.n, fp32_tg_shape.c, fp32_tg_shape.h,
           fp32_tg_shape.w);
    testbench(&rt_ctx, cvk_ctx, &fp32_tg_shape);
    printf("compare test fp32 to bf16 done\n");
  }
  {
    // test 2
    fp32_tg_shape = {1, 20, 30, 40};
    printf("test fp32 <%d,%d,%d,%d> to bf16\n", fp32_tg_shape.n, fp32_tg_shape.c, fp32_tg_shape.h,
           fp32_tg_shape.w);
    testbench(&rt_ctx, cvk_ctx, &fp32_tg_shape);
    printf("compare test fp32 to bf16 done\n");
  }

  // de-init runtime / kernel structure
  test_exit(&rt_ctx, cvk_ctx);

  // restore rounding mode
  restore_feround(round_mode);
  return 0;
}

View File

@ -0,0 +1,312 @@
// \file sample for gemm(general matrix multiply)
// header include
#include <assert.h>
#include <cvimath_internal.h> // math
#include <test_cvikernel_util.h> // kerenl
#include <sys/time.h> // int gettimeofday
#include <time.h> /* clock_t, clock, CLOCKS_PER_SEC */
typedef cvk_tiu_matrix_multiplication_param_t param_t;
// comes from
// https://stackoverflow.com/questions/47023651/multiplying-matrices-in-one-dimensional-arrays
// CPU reference GEMM on bf16 data stored in row-major 1-D arrays.
// a is row1 x col1, b is row2 x col2 (col1 must equal row2), d receives row1 x col2.
// Each product is accumulated in float and the sum is converted back to bf16.
void multiply(uint16_t *a, int row1, int col1, uint16_t *b, int row2, int col2, uint16_t *d) {
  assert(col1 == row2);
  // row2 is only read by the assert; keep it "used" when assertions are compiled out
  (void)row2;

  for (int r = 0; r < row1; r++) {
    for (int c = 0; c < col2; c++) {
      float acc = 0;
      for (int k = 0; k < col1; k++) {
        const float lhs = convert_bf16_fp32(a[r * col1 + k]);
        const float rhs = convert_bf16_fp32(b[k * col2 + c]);
        acc = acc + lhs * rhs;
      }
      d[r * col2 + c] = convert_fp32_bf16(acc);
    }
  }
}
// CPU reference GEMM on 8-bit data with 32-bit accumulation.
// a is row1 x col1, b is row2 x col2 (col1 must equal row2), d receives row1 x col2.
// When fmt is CVK_FMT_I8 the bytes are read as signed, otherwise as unsigned.
static void multiply_i32(uint8_t *a, int row1, int col1, uint8_t *b, int row2, int col2,
                         uint32_t *d, cvk_fmt_t fmt) {
  assert(col1 == row2);
  // row2 is only read by the assert; keep it "used" when assertions are compiled out
  (void)row2;

  const int as_signed = (fmt == CVK_FMT_I8);
  for (int r = 0; r < row1; r++) {
    for (int c = 0; c < col2; c++) {
      int acc = 0;
      for (int k = 0; k < col1; k++) {
        const uint8_t raw_lhs = a[r * col1 + k];
        const uint8_t raw_rhs = b[k * col2 + c];
        const int lhs = as_signed ? (int)(int8_t)raw_lhs : (int)raw_lhs;
        const int rhs = as_signed ? (int)(int8_t)raw_rhs : (int)raw_rhs;
        acc = acc + lhs * rhs;
      }
      d[r * col2 + c] = (acc);
    }
  }
}
// compare with uint16_t type
// Compare two uint16_t arrays element by element.
// Returns 0 when the first `count` entries match; otherwise prints the first
// mismatch (raw value, bf16-decoded float and hex, tagged with `info`) and returns -1.
int array_cmp_int16(const char *const info, const uint16_t *p_exp, const uint16_t *p_got,
                    int count) {
  int idx = 0;
  while (idx < count) {
    const uint16_t expected = p_exp[idx];
    const uint16_t actual = p_got[idx];
    if (expected != actual) {
      printf("%s error at index %d exp %d(%f,0x%x) got %d(%f,0x%x)\n", info, idx, expected,
             convert_bf16_fp32(expected), expected, actual, convert_bf16_fp32(actual), actual);
      return -1;
    }
    ++idx;
  }
  return 0;
}
// Compare two uint32_t arrays element by element.
// Returns 0 when the first `count` entries match; otherwise prints the first
// mismatch (tagged with `info`) and returns -1.
static int array_cmp_int32(const char *const info, const uint32_t *p_exp, const uint32_t *p_got,
                           int count) {
  int idx = 0;
  while (idx < count) {
    const uint32_t expected = p_exp[idx];
    const uint32_t actual = p_got[idx];
    if (expected != actual) {
      printf("%s error at index %d exp %d got %d\n", info, idx, expected, actual);
      return -1;
    }
    ++idx;
  }
  return 0;
}
// Convenience wrapper: packs row/col into a cvk_mg_shape_t and forwards to test_put_matrix_g().
static cvk_mg_t *_test_put_matrix_g(CVI_RT_HANDLE *rt_ctx, size_t row, size_t col,
                                    cvk_fmt_t mg_data_format, uint8_t data[]) {
  cvk_mg_shape_t matrix_shape;
  matrix_shape.row = row;
  matrix_shape.col = col;

  cvk_mg_t *matrix = test_put_matrix_g(rt_ctx, matrix_shape, mg_data_format, data);
  return matrix;
}
// Fill `matrix` with a simple repeating bf16 test pattern:
//   even index i ->  (i % 8)
//   odd  index i -> -(i % 8)
//
// The magnitude is converted to a signed int before it is negated. Negating
// it while still a size_t wraps around, which turned the odd entries into
// values near 2^64 instead of small negative numbers.
//
// matrix  output array of `size` bf16 values
// size    number of entries to fill
static void assign_bf16_values_to_matrix(uint16_t *matrix, size_t size) {
  for (size_t i = 0; i < size; i++) {
    const int magnitude = (int)(i % 8);
    const float f = (i % 2 == 0) ? (float)magnitude : -(float)magnitude;
    matrix[i] = convert_fp32_bf16(f);
  }
}
// Fill `matrix` with the byte pattern 20, 21, 22, ...; values wrap modulo 256.
static void assign_i8_values_to_matrix(uint8_t *matrix, size_t size) {
  size_t idx = 0;
  while (idx != size) {
    matrix[idx] = (uint8_t)(idx + 20);
    ++idx;
  }
}
// Runs one bf16 GEMM of an M x K matrix by a K x N matrix on the host (multiply) and on the
// device (cvm_gemm), then compares the first M * N uint16_t results bit-for-bit.
// Prints the elapsed time of both paths. Returns 0 when the results match, -1 otherwise.
static int test_gemm_bf16(size_t M, size_t N, size_t K) {
long elapsed;
struct timeval t0, t1;
int ret = 0;
// alloc test data in host
uint16_t *bf16_A = new uint16_t[M * K];
uint16_t *bf16_B = new uint16_t[N * K];
// device result staging buffer; allocated with twice M * N entries and uploaded below
// without being initialized
uint16_t *bf16_R = new uint16_t[2 * M * N];
// host reference result
uint16_t *int16_C_ref = new uint16_t[M * N];
// assign data
assign_bf16_values_to_matrix(bf16_A, M * K);
assign_bf16_values_to_matrix(bf16_B, N * K);
// measure cpu time of the reference implementation
gettimeofday(&t0, NULL);
multiply(bf16_A, M, K, bf16_B, K, N, int16_C_ref);
gettimeofday(&t1, NULL);
elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
printf("CPU GEMM takes %ld us\n", elapsed);
CVI_RT_HANDLE ctx;
cvk_context_t *bk_ctx;
// init runtime / kernel structure
test_init(&ctx, &bk_ctx);
// alloc device memory and put data to device
cvk_mg_t *mg_A = _test_put_matrix_g(&ctx, M, K, CVK_FMT_BF16, (uint8_t *)bf16_A);
cvk_mg_t *mg_B = _test_put_matrix_g(&ctx, K, N, CVK_FMT_BF16, (uint8_t *)bf16_B);
// result matrix is declared with M * 2 rows to match the 2 * M * N host buffer
cvk_mg_t *mg_R = _test_put_matrix_g(&ctx, M * 2, N, CVK_FMT_BF16, (uint8_t *)bf16_R);
// get device address for gemm
gaddr_t gaddr_a = mg_A->start_address;
gaddr_t gaddr_b = mg_B->start_address;
gaddr_t gaddr_r = mg_R->start_address;
// prepare gemm descriptor; the returned pointer is released with free() below
size_t *slice_num =
cvm_gemm((cvk_context_t *)bk_ctx, gaddr_a, gaddr_b, gaddr_r, M, K, N, CVK_FMT_BF16);
// submit descriptor and time the submission
gettimeofday(&t0, NULL);
test_submit_comp(&ctx, bk_ctx);
gettimeofday(&t1, NULL);
elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
printf("GEMM takes %ld us\n", elapsed);
// get result from device to host
uint16_t *bf16_ref = (uint16_t *)test_get_mg_mem_comp(&ctx, mg_R);
// compare only the first M * N entries; the comparison stops at the first mismatch
int cmp_res = array_cmp_int16("gemm", int16_C_ref, bf16_ref, M * N);
if (cmp_res != 0) {
ret = -1;
printf("Comparison failed for cblas_sgemm and bmblas_gemm!\n");
}
// free device resource
test_free_mg_mem_comp(&ctx, mg_A);
test_free_mg_mem_comp(&ctx, mg_B);
test_free_mg_mem_comp(&ctx, mg_R);
// de-init runtime / kernel structure
test_exit(&ctx, bk_ctx);
// free resource from host
delete[] bf16_A;
delete[] bf16_B;
delete[] bf16_R;
delete[] int16_C_ref;
free(bf16_ref);
free(slice_num);
return ret;
}
// Runs one 8-bit GEMM of an M x K matrix by a K x N matrix on the host (multiply_i32) and
// on the device (cvm_gemm), recombines the device output into 32-bit values and compares
// M * N results. Prints the elapsed time of both paths.
// Returns 0 when the results match, -1 otherwise.
// NOTE(review): i32_C is only filled when fmt == CVK_FMT_I8; for any other fmt the
// comparison below reads an uninitialized buffer. The only caller visible here passes
// CVK_FMT_I8 - confirm before adding other formats.
static int test_gemm_i8(size_t M, size_t N, size_t K, cvk_fmt_t fmt) {
long elapsed;
struct timeval t0, t1;
int ret = 0;
// 4 means 32bit takes 4 times size of uint8_t
int uint32_per_uint8 = sizeof(uint32_t) / sizeof(uint8_t);
// alloc test data in host
uint8_t *i8_A = new uint8_t[M * K];
uint8_t *i8_B = new uint8_t[N * K];
// device result staging buffer, 4 bytes per output element; uploaded below without
// being initialized
uint8_t *i8_R = new uint8_t[uint32_per_uint8 * M * N];
// host reference result
uint32_t *int32_C_ref = new uint32_t[M * N];
// assign data
assign_i8_values_to_matrix(i8_A, M * K);
assign_i8_values_to_matrix(i8_B, N * K);
// measure cpu time
gettimeofday(&t0, NULL);
multiply_i32(i8_A, M, K, i8_B, K, N, int32_C_ref, fmt);
gettimeofday(&t1, NULL);
elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
printf("CPU GEMM takes %ld us\n", elapsed);
// alloc runtime
CVI_RT_HANDLE ctx;
cvk_context_t *bk_ctx;
// init runtime / kernel structure
test_init(&ctx, &bk_ctx);
// alloc device memory and put data to device (matrices are always registered as I8 here,
// while `fmt` is what gets passed to cvm_gemm)
cvk_mg_t *mg_A = _test_put_matrix_g(&ctx, M, K, CVK_FMT_I8, (uint8_t *)i8_A);
cvk_mg_t *mg_B = _test_put_matrix_g(&ctx, K, N, CVK_FMT_I8, (uint8_t *)i8_B);
cvk_mg_t *mg_R = _test_put_matrix_g(&ctx, M * uint32_per_uint8, N, CVK_FMT_I8, (uint8_t *)i8_R);
// get device address for gemm
gaddr_t gaddr_a = mg_A->start_address;
gaddr_t gaddr_b = mg_B->start_address;
gaddr_t gaddr_r = mg_R->start_address;
// prepare gemm descriptor; slice_num is consumed by cvm_combin_gemm_i8 and freed below
size_t *slice_num = cvm_gemm((cvk_context_t *)bk_ctx, gaddr_a, gaddr_b, gaddr_r, M, K, N, fmt);
gettimeofday(&t0, NULL);
// submit descriptor
test_submit_comp(&ctx, bk_ctx);
gettimeofday(&t1, NULL);
elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
printf("GEMM takes %ld us\n", elapsed);
// get result from device to host
uint8_t *i8_R_host = (uint8_t *)test_get_mg_mem_comp(&ctx, mg_R);
// re-combine the raw device bytes into M * N 32-bit results
uint32_t *i32_C = new uint32_t[M * N];
if (fmt == CVK_FMT_I8) {
cvm_combin_gemm_i8(slice_num, i8_R_host, i32_C, M, N);
}
free(slice_num);
// compare; the comparison stops at the first mismatch
int cmp_res = array_cmp_int32("gemm", int32_C_ref, i32_C, M * N);
if (cmp_res != 0) {
ret = -1;
printf("Comparison failed for cblas_sgemm and bmblas_gemm!\n");
}
// free device resource
test_free_mg_mem_comp(&ctx, mg_A);
test_free_mg_mem_comp(&ctx, mg_B);
test_free_mg_mem_comp(&ctx, mg_R);
// de-init runtime / kernel structure
test_exit(&ctx, bk_ctx);
// free resource from host
delete[] i8_A;
delete[] i8_B;
delete[] i8_R;
delete[] int32_C_ref;
delete[] i32_C;
free(i8_R_host);
return ret;
}
// Logs the problem size and dispatches to the bf16 test when fmt is CVK_FMT_BF16,
// otherwise to the 8-bit test. Returns the selected test's result.
static int test_gemm(size_t M, size_t N, size_t K, cvk_fmt_t fmt) {
  printf("%s: M=%zu, N=%zu, K=%zu\n", __func__, M, N, K);
  const bool use_bf16 = (fmt == CVK_FMT_BF16);
  return use_bf16 ? test_gemm_bf16(M, N, K) : test_gemm_i8(M, N, K, fmt);
}
// Entry point: runs one bf16 and one int8 GEMM comparison, exiting with -1 as soon as one
// of them fails. A heavier bf16 case (300, 500, 512) is intentionally left disabled.
int main() {
  // align backend rounding; the previous mode is restored before returning
  const int round_mode = set_store_feround();

  if (test_gemm(3, 500, 512, CVK_FMT_BF16) != 0) {
    exit(-1);
  }
  if (test_gemm(1, 20000, 512, CVK_FMT_I8) != 0) {
    exit(-1);
  }

  printf("Comparison done for cpu gemm and tpu gemm!\n\n");

  restore_feround(round_mode);
  return 0;
}

View File

@ -0,0 +1,175 @@
// \file mask sample for gt (greater than), ge (greater or equal), eq (equal), lt (less than), le (less or equal)
// header include
#include <assert.h>
#include <cvimath_internal.h> // math
#include <test_cvikernel_util.h> // kerenl
// Mask type currently under test. main() sets it for every loop iteration; testbench()
// reads it to index `patterns` and to pass the mode to cvm_emit_mask.
static enum CVM_MASK_TYPE mode;
// One test case: an input vector, the expected mask output and a short label.
struct pattern {
float *input; // input values (fp32, converted to bf16 by the testbench)
float *ref; // reference output, same length as input
int len; // number of elements in input / ref
// capacity of the label below, including the terminating NUL
#define HELP_LEN (10)
char help[HELP_LEN]; // short label printed while the case runs
};
// Shared input for every mask type: four negative values, four positive values, then zero.
float cvm_mask_type_gt_0_input[] = {-1 * pow(2, -62), -0.003, -1.0, -100000, 0.000001, 1, 1000,
pow(2, 62), 0};
// Reference outputs, one entry per input element: 0 means false, 1 means true.
float cvm_mask_type_gt_0_output[] = {0, 0, 0, 0, 1, 1, 1, 1, 0};
float cvm_mask_type_ge_0_output[] = {0, 0, 0, 0, 1, 1, 1, 1, 1};
float cvm_mask_type_eq_0_output[] = {0, 0, 0, 0, 0, 0, 0, 0, 1};
float cvm_mask_type_lt_0_output[] = {1, 1, 1, 1, 0, 0, 0, 0, 0};
float cvm_mask_type_le_0_output[] = {1, 1, 1, 1, 0, 0, 0, 0, 1};
// number of elements in the shared input
int input_sz = sizeof(cvm_mask_type_gt_0_input) / sizeof(cvm_mask_type_gt_0_input[0]);
// Test cases, indexed by the CVM_MASK_TYPE value stored in `mode`; all of them reuse the
// same input array and differ only in the reference output and label.
static struct pattern patterns[] = {
{cvm_mask_type_gt_0_input, cvm_mask_type_gt_0_output, input_sz, "gt test"},
{cvm_mask_type_gt_0_input, cvm_mask_type_ge_0_output, input_sz, "ge test"},
{cvm_mask_type_gt_0_input, cvm_mask_type_eq_0_output, input_sz, "eq test"},
{cvm_mask_type_gt_0_input, cvm_mask_type_lt_0_output, input_sz, "lt test"},
{cvm_mask_type_gt_0_input, cvm_mask_type_le_0_output, input_sz, "le test"},
};
// Runs the mask test case selected by the global `mode`: uploads the bf16 input and the two
// lookup tables, emits cvm_emit_mask, reads the output back and compares it element by
// element with the reference. Exits the process with -1 on the first mismatch.
static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) {
// default test bf16 case
cvk_fmt_t fmt = CVK_FMT_BF16;
struct pattern *p = &patterns[mode];
// alloc shape, align with \len: a 1 x 1 x 1 x len tensor
uint32_t input_n = 1;
uint32_t input_c = 1;
uint32_t input_h = 1;
uint32_t input_w = p->len;
cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w};
cvk_tl_shape_t ofmap_shape = ifmap_shape;
uint64_t ifmap_size = tl_shape_size(&ifmap_shape);
uint64_t ofmap_size = tl_shape_size(&ofmap_shape);
// unit size is 1 bytes, bf16 takes 2 bytes
int data_type_size = 1;
if (fmt == CVK_FMT_BF16) {
data_type_size = 2;
}
uint64_t ifmap_bytesize = ifmap_size * data_type_size;
uint64_t ofmap_bytesize = ofmap_size * data_type_size;
// get table shape (filled in by cvm_lut_tbl_bytesize) and its size in bytes
cvk_tl_shape_t table_shape;
uint64_t table_bytesize = cvm_lut_tbl_bytesize(bmk, &table_shape, fmt);
// alloc input/output tl
cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, CTRL_AL);
cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, CTRL_AL);
// alloc lookup table
cvk_tl_t *tl_pos_neg_buf = test_alloc_tl(bmk, table_shape, fmt, CTRL_AL);
cvk_tl_t *tl_0_idx_table = test_alloc_tl(bmk, table_shape, fmt, CTRL_AL);
// alloc tmp tl
cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, CTRL_AL);
cvk_tl_t *tl_buf2 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, CTRL_AL);
cvk_tl_t *tl_buf4 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, CTRL_AL);
// alloc data from ddr
uint16_t *input_data = (uint16_t *)xmalloc(ifmap_bytesize);
uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize);
uint16_t *table_data_atan_pos_neg = (uint16_t *)xmalloc(table_bytesize);
uint16_t *idx_0_table_data = (uint16_t *)xmalloc(table_bytesize);
// init lookup table data in ddr
cvm_gen_0_tbl(idx_0_table_data, &table_shape);
cvm_pos_neg_tbl(table_data_atan_pos_neg, &table_shape);
// init input / output data in ddr: convert the fp32 pattern to bf16
for (uint32_t i = 0; i < ifmap_size; i++) {
input_data[i] = convert_fp32_bf16(p->input[i]);
ref_data[i] = convert_fp32_bf16(p->ref[i]);
}
// send ddr data to tl
test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)input_data);
test_put_tensor_g2l_comp(ctx, bmk, tl_pos_neg_buf, (uint8_t *)table_data_atan_pos_neg);
test_put_tensor_g2l_comp(ctx, bmk, tl_0_idx_table, (uint8_t *)idx_0_table_data);
// emit mask function
cvm_emit_mask(bmk,
tl_ifmap, // input
tl_buf, tl_buf2, tl_buf4, // tmp buffer
tl_pos_neg_buf, tl_0_idx_table, // lookup table
tl_ofmap_bf16, // output
fmt, mode);
// submit descriptor
test_submit_comp(ctx, bmk);
// get data from tl
uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, tl_ofmap_bf16);
// compare with reference (input and output shapes are identical, so ifmap_size also
// bounds the output)
for (uint32_t i = 0; i < ifmap_size; i++) {
if (ref_data[i] != ofmap_data[i]) {
fprintf(stderr, "comparing failed at mode (%s) output[%u] got %f(0x%x), ref %f(0x%x)\n",
p->help, i, convert_bf16_fp32(ofmap_data[i]), ofmap_data[i],
convert_bf16_fp32(ref_data[i]), ref_data[i]);
// fail case
exit(-1);
}
}
// free resource from kernel, in reverse order of allocation
free_tl(bmk, tl_buf4);
free_tl(bmk, tl_buf2);
free_tl(bmk, tl_buf);
free_tl(bmk, tl_0_idx_table);
free_tl(bmk, tl_pos_neg_buf);
free_tl(bmk, tl_ofmap_bf16);
free_tl(bmk, tl_ifmap);
// free resource from heap
free(input_data);
free(ref_data);
free(ofmap_data);
free(table_data_atan_pos_neg);
free(idx_0_table_data);
}
int main() {
CVI_RT_HANDLE ctx;
cvk_context_t *bmk;
int round_mode;
// align kerenl rounding mode
round_mode = set_store_feround();
// init runtime / kerenl structure
test_init(&ctx, &bmk);
for (int i = CVM_MASK_TYPE_GT_0; i < CVM_MASK_MAX; i++) {
mode = static_cast<enum CVM_MASK_TYPE>(i);
struct pattern *p = &patterns[mode];
printf("test %s...\n", p->help);
testbench(&ctx, bmk);
}
// de-init runtime / kerenl structure
test_exit(&ctx, bmk);
// restore rounding mode
restore_feround(round_mode);
return 0;
}

View File

@ -0,0 +1,160 @@
// \file sample for reducing a tensor by multiplication over its h/w dimensions (cvm_reduce_hw_mul), int8 and bf16
// header include
#include <assert.h>
#include <cvimath_internal.h> // math
#include <test_cvikernel_util.h> // kerenl
// Fills the input buffer with the repeating element pattern 1, 1, 2, 3 (index modulo 4,
// with 0 replaced by 1 so a product over the data never collapses to zero).
// The element width comes from cvm_bytesize_of_fmt(fmt); two-byte elements are stored as
// the bf16 encoding of the value, other widths store the raw byte.
void init_input(uint8_t *input_data, uint64_t ifmap_bytesize, cvk_fmt_t fmt) {
  const uint32_t elem_bytes = cvm_bytesize_of_fmt(fmt);
  const uint64_t elem_count = ifmap_bytesize / elem_bytes;
  const int period = 4;

  for (uint64_t idx = 0; idx < elem_count; ++idx) {
    uint8_t raw[2];
    const uint8_t phase = idx % period;
    raw[0] = (phase != 0) ? phase : 1;  // prevent multiplying by 0

    if (elem_bytes == 2) {
      // bf16: replace the raw byte by the two-byte bf16 encoding of the same value
      const uint16_t encoded = convert_fp32_bf16((float)raw[0]);
      memcpy(raw, &encoded, elem_bytes);
    }
    memcpy(&input_data[idx * elem_bytes], raw, elem_bytes);
  }
}
// Computes the host reference for the h/w reduction: for every (n, c) pair, multiplies all
// h * w input elements together in float and stores one result element in ref_data.
// Two-byte elements are decoded/encoded as bf16; otherwise the raw byte is used and the
// product is converted back to a byte.
void init_ref(uint8_t *input_data, uint8_t *ref_data, cvk_tl_shape_t *ifmap_shape, cvk_fmt_t fmt) {
  const uint32_t elem_bytes = cvm_bytesize_of_fmt(fmt);
  const bool is_bf16 = (elem_bytes == 2);
  // The loops below walk the tensor in n, c, h, w order, so the linear element index simply
  // advances by one per visited element.
  uint32_t elem_idx = 0;
  int out_idx = 0;

  for (uint32_t n = 0; n < ifmap_shape->n; n++) {
    for (uint32_t c = 0; c < ifmap_shape->c; c++) {
      float product = 1;
      for (uint32_t h = 0; h < ifmap_shape->h; h++) {
        for (uint32_t w = 0; w < ifmap_shape->w; w++, elem_idx++) {
          const uint32_t byte_off = elem_idx * elem_bytes;
          float value;
          if (is_bf16) {
            uint16_t encoded;
            memcpy(&encoded, &input_data[byte_off], elem_bytes);
            value = convert_bf16_fp32(encoded);
          } else {
            value = input_data[byte_off];
          }
          product = value * product;
        }
      }

      // store one reduced element per (n, c)
      uint8_t packed[2];
      if (is_bf16) {
        const uint16_t encoded = convert_fp32_bf16(product);
        memcpy(packed, (void *)&encoded, elem_bytes);
      } else {
        packed[0] = product;
      }
      memcpy(&ref_data[out_idx * elem_bytes], packed, elem_bytes);
      out_idx++;
    }
  }
}
// Runs the h/w reduce-multiply sample for one data format: builds a 1 x 3 x 2 x 2 input,
// computes the host reference, runs cvm_reduce_hw_mul on the device, reads back the
// 1 x 3 x 1 x 1 result and compares it byte by byte with the reference.
// Exits the process with -1 on the first mismatching byte.
static void testbench(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, cvk_fmt_t fmt) {
  // alloc shape
  uint32_t input_n = 1;
  uint32_t input_c = 3;
  uint32_t input_h = 2;
  uint32_t input_w = 2;
  cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w};
  // NOTICE: ONLY reduce hw for performance
  cvk_tl_shape_t ofmap_shape = {input_n, input_c, 1, 1};
  uint64_t ifmap_size = tl_shape_size(&ifmap_shape);
  uint64_t ofmap_size = tl_shape_size(&ofmap_shape);

  // unit size is 1 byte, bf16 takes 2 bytes
  int data_type_size = 1;
  if (fmt == CVK_FMT_BF16) {
    data_type_size = 2;
  }
  uint64_t ifmap_bytesize = ifmap_size * data_type_size;
  uint64_t ofmap_bytesize = ofmap_size * data_type_size;

  // alloc input tl; the reduction result is read back from the same tl after reshaping
  cvk_tl_t *tl_ifmap = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, CTRL_AL);

  // alloc data from ddr
  uint8_t *input_data = (uint8_t *)xmalloc(ifmap_bytesize);
  uint8_t *ref_data = (uint8_t *)xmalloc(ofmap_bytesize);

  // init input / output data in ddr
  init_input(input_data, ifmap_bytesize, fmt);
  init_ref(input_data, ref_data, &ifmap_shape, fmt);

  // send host memory->device memory->tpu_memory
  test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_ifmap, (uint8_t *)input_data);

  // prepare command buffer
  cvm_reduce_hw_mul(cvk_ctx, tl_ifmap);

  // submit descriptor
  test_submit_comp(rt_ctx, cvk_ctx);

  // reshape for reduce result
  tl_ifmap->shape = {tl_ifmap->shape.n, tl_ifmap->shape.c, 1, 1};
  tl_ifmap->stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_ifmap->shape, tl_ifmap->fmt, 1);

  // get data from tl
  uint8_t *ofmap_data = test_get_tensor_l2g_comp(rt_ctx, cvk_ctx, tl_ifmap);

  // Compare with the reference byte by byte. The bound must be the size in BYTES:
  // iterating only ofmap_size times would leave half of a bf16 result unchecked.
  for (uint32_t i = 0; i < ofmap_bytesize; i++) {
    if (ref_data[i] != ofmap_data[i]) {
      fprintf(stderr, "comparing failed output[%u] got %u, ref %u\n", i, ofmap_data[i],
              ref_data[i]);
      // fail case
      exit(-1);
    }
  }

  // free resource from tpu memory
  free_tl(cvk_ctx, tl_ifmap);

  // free resource from host memory
  free(input_data);
  free(ref_data);
  free(ofmap_data);
}
// Entry point: runs the reduce-multiply testbench once for int8 and once for bf16.
int main() {
  CVI_RT_HANDLE rt_ctx;
  cvk_context_t *cvk_ctx;

  // align kernel rounding mode; restored before returning
  const int round_mode = set_store_feround();

  // init runtime / kernel structure
  test_init(&rt_ctx, &cvk_ctx);

  printf("test reduce mul int8\n");
  testbench(&rt_ctx, cvk_ctx, CVK_FMT_I8);

  printf("test reduce mul bf16\n");
  testbench(&rt_ctx, cvk_ctx, CVK_FMT_BF16);

  // de-init runtime / kernel structure
  test_exit(&rt_ctx, cvk_ctx);

  restore_feround(round_mode);
  return 0;
}

View File

@ -0,0 +1,656 @@
// \file sample for setting values by mask; please refer to cvimath_internal.h for more details
// header include
#include <assert.h>
#include <cvimath_internal.h> // math
#include <test_cvikernel_util.h> // kerenl
#include <sys/time.h> // int gettimeofday
#include <time.h> /* clock_t, clock, CLOCKS_PER_SEC */
#define DEBUG 1 // set to 0 to disable debug output
// printf-style logging to stderr, active only when DEBUG is non-zero.
// The expansion is "fmt, __VA_ARGS__", so at least one argument must follow the format
// string (plain messages are passed as debug_print("%s", "text")).
#define debug_print(fmt, ...) \
do { \
if (DEBUG) fprintf(stderr, fmt, __VA_ARGS__); \
} while (0)
// index of the buffer set used by testbench() when ping-pong is disabled
int flip = 0;
// Description of one test case: the device-side function under test, the matching
// host-side reference and the scalar parameters handed to both.
struct testbench {
char *name; // test case name; also compared by string to enable case-specific handling
// device side: emits the commands for the operation; a non-zero return is treated as
// failure by the caller
int (*cvm_run)(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2, cvk_tl_t *tl_buf,
cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl, cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias,
uint8_t threshold, uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap);
// host side: updates ref_data in place to the expected output
void (*ref)(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask, uint8_t *pNewY, uint8_t *pY,
uint8_t *g_update_tbl, uint8_t threshold, uint8_t w1, uint8_t w2);
uint8_t threshold; // passed to both cvm_run and ref
uint8_t w1; // passed to both cvm_run and ref
uint8_t w2; // passed to both cvm_run and ref
};
// Sets every one of the kernel_size bytes of kernel_data to the signed value `val`.
static void init_kernel(uint8_t *kernel_data, uint64_t kernel_size, int8_t val) {
  int8_t *cursor = (int8_t *)kernel_data;
  uint64_t remaining = kernel_size;
  while (remaining > 0) {
    *cursor = val;
    ++cursor;
    --remaining;
  }
}
// Fills a 16-bit bias buffer stored as two byte planes: the first bias_size / 2 bytes get
// the low byte of `val`, the following bias_size / 2 bytes get its high byte.
static void init_bias(uint8_t *bias_data, uint64_t bias_size, int16_t val) {
  const int plane_len = bias_size / 2;
  const uint8_t low_byte = val & 0xff;
  const uint8_t high_byte = (val >> 8) & 0xff;
  for (int ch = 0; ch < plane_len; ++ch) {
    bias_data[ch] = low_byte;
    bias_data[ch + plane_len] = high_byte;
  }
}
// Fills the buffer with 2 * i for indices divisible by 3 and -(2 * i) for the others,
// each truncated to 8 bits.
static void init_input_2(uint8_t *input_data, uint64_t ifmap_size) {
  for (uint64_t pos = 0; pos < ifmap_size; ++pos) {
    const uint64_t doubled = pos * 2;
    if (pos % 3 != 0) {
      // negate in unsigned arithmetic; only the low byte is kept
      input_data[pos] = (uint8_t)(0 - doubled);
    } else {
      input_data[pos] = (uint8_t)doubled;
    }
  }
}
// Fills the buffer with 3 * i, truncated to 8 bits.
static void init_input_3(uint8_t *input_data, uint64_t ifmap_size) {
  uint64_t pos = 0;
  while (pos < ifmap_size) {
    input_data[pos] = (uint8_t)(pos * 3);
    ++pos;
  }
}
// Fills the mask with the alternating pattern 0, 1, 0, 1, ... (index parity).
static void init_mask(uint8_t *mask, uint64_t ifmap_size) {
  for (uint64_t pos = 0; pos < ifmap_size; ++pos) {
    const bool odd = (pos % 2) != 0;
    mask[pos] = odd ? 1 : 0;
  }
}
// Fills the table, viewed as signed bytes, with +i for even indices and -i for odd
// indices (truncated to 8 bits).
static void init_update_tbl(uint8_t *update_tbl, uint64_t ifmap_size) {
  int8_t *signed_tbl = (int8_t *)update_tbl;
  for (uint64_t pos = 0; pos < ifmap_size; ++pos) {
    // negate in unsigned arithmetic; only the low byte is kept
    const uint64_t value = (pos % 2) ? (0 - pos) : pos;
    signed_tbl[pos] = value;
  }
}
// Fills the initial output image with -i truncated to 8 bits (0, 255, 254, ...).
static void init_ref(uint8_t *ref_data, uint64_t ofmap_size) {
  uint64_t pos = 0;
  while (pos < ofmap_size) {
    // negate in unsigned arithmetic; only the low byte is kept
    ref_data[pos] = (uint8_t)(0 - pos);
    ++pos;
  }
}
// Host reference: wherever mask[i] is non-zero, ref_data[i] is replaced by pNewY[i];
// other elements are left untouched. pY, g_update_tbl, threshold, w1 and w2 are unused
// and only present to match the common reference signature.
static void set_image_by_u8mask(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask,
                                uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl,
                                uint8_t threshold, uint8_t w1, uint8_t w2) {
  (void)pY;
  (void)g_update_tbl;
  (void)threshold;
  (void)w1;
  (void)w2;
  for (size_t pos = 0; pos < ifmap_size; ++pos) {
    if (!mask[pos]) {
      continue;
    }
    ref_data[pos] = pNewY[pos];
  }
}
// Host reference: ref_data[i] is replaced by pNewY[i] when mask[i] is non-zero AND the
// table entry, read as a signed byte, is below threshold. pY, w1 and w2 are unused.
static void set_image_by_two_info_i8(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask,
                                     uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl,
                                     uint8_t threshold, uint8_t w1, uint8_t w2) {
  (void)pY;
  (void)w1;
  (void)w2;
  const int8_t *signed_tbl = (int8_t *)g_update_tbl;
  for (size_t pos = 0; pos < ifmap_size; ++pos) {
    if (!mask[pos]) {
      continue;
    }
    if (signed_tbl[pos] < threshold) {
      ref_data[pos] = pNewY[pos];
    }
  }
}
// Host reference: ref_data[i] = |pNewY[i] - pY[i]| for every element.
// mask, g_update_tbl, threshold, w1 and w2 are unused.
static void gen_image_diff(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask, uint8_t *pNewY,
                           uint8_t *pY, uint8_t *g_update_tbl, uint8_t threshold, uint8_t w1,
                           uint8_t w2) {
  (void)mask;
  (void)w1;
  (void)w2;
  (void)g_update_tbl;
  (void)threshold;
  for (size_t pos = 0; pos < ifmap_size; ++pos) {
    // both operands are promoted to int, so the difference lies in [-255, 255]
    const int delta = pNewY[pos] - pY[pos];
    ref_data[pos] = (delta < 0) ? -delta : delta;
  }
}
// Host reference that updates a signed 8-bit table (ref_data) in place and rewrites mask:
//  - mask is first cleared to 0 everywhere;
//  - where g_update_tbl[i] < threshold, the table entry becomes 0 if it is below w1,
//    otherwise it is decremented by one;
//  - elsewhere the entry is incremented by one unless it is already 127, and mask[i] is
//    set to 1.
// pNewY, pY and w2 are unused.
static void update_tbl_by_threshold(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask,
                                    uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl,
                                    uint8_t threshold, uint8_t w1, uint8_t w2) {
  (void)pNewY;
  (void)pY;
  (void)w2;
  int8_t *table = (int8_t *)ref_data;  // output is i8

  // first pass: clear the whole mask
  for (size_t pos = 0; pos < ifmap_size; ++pos) {
    mask[pos] = 0;
  }

  // second pass: update the table and flag the entries at or above the threshold
  for (size_t pos = 0; pos < ifmap_size; ++pos) {
    const int8_t current = table[pos];
    if (g_update_tbl[pos] >= threshold) {
      if (current != 127) {
        // saturate at the int8 maximum
        table[pos] = current + 1;
      }
      mask[pos] = 1;
      continue;
    }
    if (current < w1) {
      table[pos] = 0;
    } else {
      table[pos] = current - 1;
    }
  }
}
// Host reference: ref_data[i] is replaced by pNewY[i] wherever the table entry, read as
// an unsigned byte, is greater than or equal to threshold. pY, mask, w1, w2 are unused.
static void set_image_by_two_info_u8(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask,
                                     uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl,
                                     uint8_t threshold, uint8_t w1, uint8_t w2) {
  (void)pY;
  (void)mask;
  (void)w1;
  (void)w2;
  for (size_t pos = 0; pos < ifmap_size; ++pos) {
    if (g_update_tbl[pos] < threshold) {
      continue;
    }
    ref_data[pos] = pNewY[pos];
  }
}
// Host reference: wherever the table entry, read as a signed byte, is greater than
// threshold, ref_data[i] becomes (w1 * ref_data[i] + w2 * pNewY[i]) >> 8.
// mask and pY are unused.
static void blend_image_by_tbl(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask,
                               uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl,
                               uint8_t threshold, uint8_t w1, uint8_t w2) {
  (void)mask;
  (void)pY;
  const int8_t *signed_tbl = (int8_t *)g_update_tbl;
  for (size_t pos = 0; pos < ifmap_size; ++pos) {
    if (signed_tbl[pos] <= threshold) {
      continue;
    }
    const int weighted_old = w1 * ref_data[pos];
    const int weighted_new = w2 * pNewY[pos];
    ref_data[pos] = (weighted_old + weighted_new) >> 8;
  }
}
// Adapter with the common cvm_run signature: forwards ctx, tl_ifmap, tl_buf, tl_mask and
// tl_ofmap to cvm_set_image_by_u8mask and returns its result. The remaining parameters
// are not needed by this operation.
static int _cvm_set_image_by_u8mask(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2,
                                    cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl,
                                    cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, uint8_t threshold,
                                    uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap) {
  // unused by this operation
  (void)tl_ifmap2;
  (void)tl_update_tbl;
  (void)tl_kernel;
  (void)tl_bias;
  (void)threshold;
  (void)w1;
  (void)w2;
  const int status = cvm_set_image_by_u8mask(ctx, tl_ifmap, tl_buf, tl_mask, tl_ofmap);
  return status;
}
// Adapter with the common cvm_run signature: forwards ctx, tl_ifmap, tl_mask, tl_kernel,
// tl_bias and tl_ofmap to cvm_set_image_by_u8mask_dp and returns its result. The
// remaining parameters are not needed by this operation.
static int _cvm_set_image_by_u8mask_dp(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2,
                                       cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl,
                                       cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, uint8_t threshold,
                                       uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap) {
  // unused by this operation
  (void)tl_ifmap2;
  (void)tl_buf;
  (void)tl_update_tbl;
  (void)threshold;
  (void)w1;
  (void)w2;
  const int status =
      cvm_set_image_by_u8mask_dp(ctx, tl_ifmap, tl_mask, tl_kernel, tl_bias, tl_ofmap);
  return status;
}
// Adapter with the common cvm_run signature: forwards ctx, tl_ifmap, tl_buf, tl_mask,
// tl_update_tbl, threshold and tl_ofmap to cvm_set_image_by_two_info_i8 and returns its
// result. The remaining parameters are not needed by this operation.
static int _cvm_set_image_by_two_info_i8(cvk_context_t *ctx, cvk_tl_t *tl_ifmap,
                                         cvk_tl_t *tl_ifmap2, cvk_tl_t *tl_buf, cvk_tl_t *tl_mask,
                                         cvk_tl_t *tl_update_tbl, cvk_tl_t *tl_kernel,
                                         cvk_tl_t *tl_bias, uint8_t threshold, uint8_t w1,
                                         uint8_t w2, cvk_tl_t *tl_ofmap) {
  // unused by this operation
  (void)tl_ifmap2;
  (void)tl_kernel;
  (void)tl_bias;
  (void)w1;
  (void)w2;
  const int status = cvm_set_image_by_two_info_i8(ctx, tl_ifmap, tl_buf, tl_mask, tl_update_tbl,
                                                  threshold, tl_ofmap);
  return status;
}
// Adapter with the common cvm_run signature: forwards ctx, tl_ifmap, tl_kernel, tl_mask,
// tl_update_tbl, tl_bias and tl_ofmap to cvm_set_image_by_two_info_i8_dp and returns its
// result. threshold is not forwarded here; the remaining parameters are not needed either.
static int _cvm_set_image_by_two_info_i8_dp(cvk_context_t *ctx, cvk_tl_t *tl_ifmap,
                                            cvk_tl_t *tl_ifmap2, cvk_tl_t *tl_buf,
                                            cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl,
                                            cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias,
                                            uint8_t threshold, uint8_t w1, uint8_t w2,
                                            cvk_tl_t *tl_ofmap) {
  // unused by this operation
  (void)tl_ifmap2;
  (void)tl_buf;
  (void)threshold;
  (void)w1;
  (void)w2;
  const int status = cvm_set_image_by_two_info_i8_dp(ctx, tl_ifmap, tl_kernel, tl_mask,
                                                     tl_update_tbl, tl_bias, tl_ofmap);
  return status;
}
// Adapter with the common cvm_run signature: forwards ctx, tl_ifmap, tl_ifmap2, tl_mask,
// tl_buf and tl_ofmap to cvm_gen_image_diff and returns its result. tl_mask is handed over
// as a scratch buffer here; the remaining parameters are not needed by this operation.
static int _cvm_gen_image_diff(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2,
                               cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl,
                               cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, uint8_t threshold,
                               uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap) {
  // unused by this operation
  (void)tl_update_tbl;
  (void)tl_kernel;
  (void)tl_bias;
  (void)threshold;
  (void)w1;
  (void)w2;
  // tl_mask as buffer
  const int status = cvm_gen_image_diff(ctx, tl_ifmap, tl_ifmap2, tl_mask, tl_buf, tl_ofmap);
  return status;
}
// Adapter with the common cvm_run signature: forwards ctx, tl_mask, tl_ifmap, tl_ifmap2,
// tl_buf, tl_update_tbl, threshold, w1 and tl_ofmap to cvm_update_tbl_by_threshold and
// returns its result. tl_kernel, tl_bias and w2 are not needed by this operation.
static int _cvm_update_tbl_by_threshold(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2,
                                        cvk_tl_t *tl_buf, cvk_tl_t *tl_mask,
                                        cvk_tl_t *tl_update_tbl, cvk_tl_t *tl_kernel,
                                        cvk_tl_t *tl_bias, uint8_t threshold, uint8_t w1,
                                        uint8_t w2, cvk_tl_t *tl_ofmap) {
  // unused by this operation
  (void)tl_kernel;
  (void)tl_bias;
  (void)w2;
  // w1 as threshold_b, tl_ifmap/tl_ifmap2 as buf
  const int status = cvm_update_tbl_by_threshold(ctx, tl_mask, tl_ifmap, tl_ifmap2, tl_buf,
                                                 tl_update_tbl, threshold, w1, tl_ofmap);
  return status;
}
// Adapter with the common cvm_run signature: forwards ctx, tl_ifmap, tl_ifmap2, tl_buf,
// tl_update_tbl, threshold and tl_ofmap to cvm_set_image_by_two_info_u8 and returns its
// result. tl_mask, tl_kernel, tl_bias, w1 and w2 are not needed by this operation.
static int _cvm_set_image_by_two_info_u8(cvk_context_t *ctx, cvk_tl_t *tl_ifmap,
                                         cvk_tl_t *tl_ifmap2, cvk_tl_t *tl_buf, cvk_tl_t *tl_mask,
                                         cvk_tl_t *tl_update_tbl, cvk_tl_t *tl_kernel,
                                         cvk_tl_t *tl_bias, uint8_t threshold, uint8_t w1,
                                         uint8_t w2, cvk_tl_t *tl_ofmap) {
  // unused by this operation
  (void)tl_mask;
  (void)tl_kernel;
  (void)tl_bias;
  (void)w1;
  (void)w2;
  // tl_ifmap2 as buf
  const int status = cvm_set_image_by_two_info_u8(ctx, tl_ifmap, tl_ifmap2, tl_buf,
                                                  tl_update_tbl, threshold, tl_ofmap);
  return status;
}
// Adapter with the common cvm_run signature: forwards ctx, tl_ifmap, tl_mask, tl_buf,
// tl_update_tbl, threshold, w1, w2 and tl_ofmap to cvm_blend_image_by_tbl and returns its
// result. tl_ifmap2, tl_kernel and tl_bias are not needed by this operation.
static int _cvm_blend_image_by_tbl(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2,
                                   cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl,
                                   cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, uint8_t threshold,
                                   uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap) {
  // unused by this operation
  (void)tl_ifmap2;
  (void)tl_kernel;
  (void)tl_bias;
  // tl_mask as buf
  const int status = cvm_blend_image_by_tbl(ctx, tl_ifmap, tl_mask, tl_buf, tl_update_tbl,
                                            threshold, w1, w2, tl_ofmap);
  return status;
}
// Table of test cases. Each entry is
// {name, device-side adapter, host-side reference, threshold, w1, w2}.
// Several device-side variants (plain and "_dp") share the same host reference.
struct testbench testbenchs[] = {
{(char *)"set_image_by_two_info_i8_dp", _cvm_set_image_by_two_info_i8_dp,
set_image_by_two_info_i8, 2, 2, 3},
{(char *)"set_image_by_u8mask_dp", _cvm_set_image_by_u8mask_dp, set_image_by_u8mask, 10, 2, 3},
{(char *)"set_image_by_u8mask", _cvm_set_image_by_u8mask, set_image_by_u8mask, 10, 2, 3},
{(char *)"set_image_by_two_info_i8", _cvm_set_image_by_two_info_i8, set_image_by_two_info_i8, 2,
2, 3},
{(char *)"update_tbl_by_threshold", _cvm_update_tbl_by_threshold, update_tbl_by_threshold, 15,
12, 3},
{(char *)"gen_image_diff", _cvm_gen_image_diff, gen_image_diff, 10, 2, 3},
{(char *)"set_image_by_two_info_u8", _cvm_set_image_by_two_info_u8, set_image_by_two_info_u8,
40, 2, 3},
{(char *)"blend_image_by_tbl", _cvm_blend_image_by_tbl, blend_image_by_tbl, 6, 2, 3},
};
// Uploads one tile of host data into its local tensors with test_put_tensor_g2l_comp.
// Transfers are issued in this order: ifmap2, ifmap3, mask, update table, ofmap.
static void load(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap2,
                 uint8_t *input_ifmap2, cvk_tl_t *tl_ifmap3, uint8_t *input_ifmap3,
                 cvk_tl_t *tl_ofmap, uint8_t *input_ofmap, cvk_tl_t *tl_mask, uint8_t *input_mask,
                 cvk_tl_t *tl_update_tbl, uint8_t *input_update_tbl) {
  cvk_tl_t *const targets[] = {tl_ifmap2, tl_ifmap3, tl_mask, tl_update_tbl, tl_ofmap};
  uint8_t *const sources[] = {input_ifmap2, input_ifmap3, input_mask, input_update_tbl,
                              input_ofmap};
  const int transfer_nr = sizeof(targets) / sizeof(targets[0]);
  // send device memory to sram
  for (int t = 0; t < transfer_nr; ++t) {
    test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, targets[t], sources[t]);
  }
}
// Reads one tile of results back to the host: copies sz bytes of tl_ofmap into
// output_ofmap and, only for the "update_tbl_by_threshold" case, sz bytes of tl_mask into
// output_mask. The temporary buffers returned by test_get_tensor_l2g_comp are freed here.
static void store(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, char *name, cvk_tl_t *tl_ofmap,
                  uint8_t *output_ofmap, cvk_tl_t *tl_mask, uint8_t *output_mask, int sz) {
  // NOTICE: heavy copy
  uint8_t *fetched_ofmap = (uint8_t *)test_get_tensor_l2g_comp(rt_ctx, cvk_ctx, tl_ofmap);
  memcpy(output_ofmap, fetched_ofmap, sz);
  free(fetched_ofmap);

  const bool exports_mask = (strcmp(name, "update_tbl_by_threshold") == 0);
  if (!exports_mask) {
    return;
  }
  uint8_t *fetched_mask = (uint8_t *)test_get_tensor_l2g_comp(rt_ctx, cvk_ctx, tl_mask);
  memcpy(output_mask, fetched_mask, sz);
  free(fetched_mask);
}
// Runs test case testbenchs[testcase_idx] on a tensor of shape tg_shape.
// The host reference is computed first, then the tensor is processed on the device tile by
// tile (channels are split into tiles of at most npu_num channels), either sequentially or,
// when is_pingpong is set, with two buffer sets in a load / compute / store pipeline.
// The device output is compared byte by byte with the reference; the process exits with -1
// on the first mismatch or when command generation fails.
static void testbench(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, cvk_tg_shape_t *tg_shape,
int testcase_idx, int is_pingpong = false) {
// for calculate size we need in host
cvk_tl_shape_t ifmap_shape = {tg_shape->n, tg_shape->c, tg_shape->h, tg_shape->w};
cvk_tl_shape_t ofmap_shape = ifmap_shape;
uint64_t ifmap_size = tl_shape_size(&ifmap_shape);
uint64_t ofmap_size = tl_shape_size(&ofmap_shape);
// unit size is 1 bytes
int data_type_size = 1;
// get input/output size
uint64_t ifmap_bytesize = ifmap_size * data_type_size;
uint64_t ofmap_bytesize = ofmap_size * data_type_size;
// alloc on ddr
// uint8_t *input_data = (uint8_t *)xmalloc(ifmap_bytesize);
uint8_t *input_data2 = (uint8_t *)xmalloc(ifmap_bytesize);
uint8_t *input_data3 = (uint8_t *)xmalloc(ifmap_bytesize);
uint8_t *mask = (uint8_t *)xmalloc(ifmap_bytesize);
uint8_t *update_tbl = (uint8_t *)xmalloc(ifmap_bytesize);
// _ref_data keeps the untouched initial output image that is uploaded to the device;
// ref_data is turned into the expected result by the host reference
uint8_t *_ref_data = (uint8_t *)xmalloc(ofmap_bytesize);
uint8_t *ref_data = (uint8_t *)xmalloc(ofmap_bytesize);
uint8_t *tpu_output_data = (uint8_t *)xmalloc(ofmap_bytesize);
uint8_t *tpu_output_mask = (uint8_t *)xmalloc(ofmap_bytesize);
// init input / output data in ddr
uint8_t threshold, w1, w2;
threshold = testbenchs[testcase_idx].threshold;
w1 = testbenchs[testcase_idx].w1;
w2 = testbenchs[testcase_idx].w2;
init_input_2(input_data2, ifmap_size);
init_input_3(input_data3, ifmap_size);
// init_input(input_data2, ifmap_size);
// init_input(input_data3, ifmap_size);
init_mask(mask, ifmap_size);
init_update_tbl(update_tbl, ifmap_size);
init_ref(ref_data, ofmap_size);
// keep org output
memcpy(_ref_data, ref_data, ofmap_bytesize);
// compute the expected result on the host.
// NOTE(review): this runs before the data is uploaded, so a reference that rewrites
// `mask` (update_tbl_by_threshold does) changes the mask that is later sent to the
// device - confirm this is intended.
testbenchs[testcase_idx].ref(ref_data, ofmap_size, mask, input_data2, input_data3, update_tbl,
threshold, w1, w2);
// split the channels into tiles of at most npu_num channels each; the division below is
// an integer division, so a channel count that is not a multiple of `tiles` loses the
// remainder
int tiles = std::ceil(ifmap_shape.c / (float)cvk_ctx->info.npu_num);
ifmap_shape.c = ifmap_shape.c / tiles;
// kernel: one byte per (n, c) of a tile
cvk_tl_shape_t kernel_shape = ifmap_shape;
kernel_shape.h = 1;
kernel_shape.w = 1;
// bias: n = 2, matching the two byte planes written by init_bias
cvk_tl_shape_t bias_shape = ifmap_shape;
bias_shape.h = 1;
bias_shape.w = 1;
bias_shape.n = 2;
uint64_t kernel_size = tl_shape_size(&kernel_shape);
uint64_t bias_size = tl_shape_size(&bias_shape);
uint64_t kernel_bytesize = kernel_size * data_type_size;
uint64_t bias_bytesize = bias_size * data_type_size;
uint8_t *kernel_data = (uint8_t *)xmalloc(kernel_bytesize);
uint8_t *bias_data = (uint8_t *)xmalloc(bias_bytesize);
// NOTICE: must init with it
init_kernel(kernel_data, kernel_size, -1);
init_bias(bias_data, bias_size, 1);
// this case uses its own kernel / bias values, with the bias derived from the threshold
if (!strcmp(testbenchs[testcase_idx].name, "set_image_by_two_info_i8_dp")) {
init_kernel(kernel_data, kernel_size, 1);
init_bias(bias_data, bias_size, -1 * threshold);
}
if (is_pingpong) {
// quirk that we tile h for easy implementation
ifmap_shape.h /= 2;
tiles *= 2;
}
// sync input/output
ofmap_shape = ifmap_shape;
// NOTICE: dont care batch
// number of bytes in one tile
int shape_sz = ifmap_shape.c * ifmap_shape.h * ifmap_shape.w;
// alloc on sram, just once
cvk_fmt_t fmt = CVK_FMT_U8; // for mac used
int eu_align = 1; // dont care
// two entries per tensor: the second set is only allocated for ping-pong
cvk_tl_t *tl_ifmap2[2] = {NULL, NULL};
cvk_tl_t *tl_ifmap3[2] = {NULL, NULL};
cvk_tl_t *tl_ofmap[2] = {NULL, NULL};
cvk_tl_t *tl_mask[2] = {NULL, NULL};
cvk_tl_t *tl_update_tbl[2] = {NULL, NULL};
// must place last for high part of 'mac'
cvk_tl_t *tl_buf[2] = {NULL, NULL};
cvk_tl_t *tl_kernel, *tl_bias;
// alloc sram
tl_kernel = test_alloc_tl(cvk_ctx, kernel_shape, CVK_FMT_I8, eu_align);
tl_bias = test_alloc_tl(cvk_ctx, bias_shape, CVK_FMT_I8, /*eu_align=*/0);
int alloc_nr = is_pingpong ? 2 : 1;
for (int i = 0; i < alloc_nr; i++) {
tl_ifmap2[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align);
tl_ifmap3[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align);
tl_ofmap[i] = test_alloc_tl(cvk_ctx, ofmap_shape, fmt, eu_align);
tl_mask[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align);
tl_update_tbl[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align);
// must place last for high part of 'mac'
tl_buf[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align);
}
// NOTICE: consider residual
int load_offset = 0;
int store_offset = 0;
int ret;
// buffer set used by the sequential path
int curr = flip;
long elapsed;
struct timeval t0, t1;
gettimeofday(&t0, NULL);
if (!is_pingpong) {
// sequential path: load, run and store one tile after the other
int off = 0;
for (int i = 0; i < tiles; i++) {
// NOTICE: load each loop
test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_kernel, kernel_data);
test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_bias, bias_data);
load(rt_ctx, cvk_ctx, tl_ifmap2[curr], input_data2 + off, tl_ifmap3[curr], input_data3 + off,
tl_ofmap[curr], _ref_data + off, tl_mask[curr], mask + off, tl_update_tbl[curr],
update_tbl + off);
// note: this declaration shadows the outer `ret`
int ret = testbenchs[testcase_idx].cvm_run(
cvk_ctx, tl_ifmap2[curr], tl_ifmap3[curr], tl_buf[curr], tl_mask[curr],
tl_update_tbl[curr], tl_kernel, tl_bias, threshold, w1, w2, tl_ofmap[curr]);
if (ret) {
fflush(stderr);
printf("%s", "generate commands fail, return\n");
exit(-1);
}
store(rt_ctx, cvk_ctx, testbenchs[testcase_idx].name, tl_ofmap[curr], tpu_output_data + off,
tl_mask[curr], tpu_output_mask + off, shape_sz);
off += shape_sz;
}
} else {
// ping-pong path: in iteration i the tile i - 2 is stored, the tile i - 1 is computed and
// the tile i is loaded, alternating between the two buffer sets; hence tiles + 2 rounds
// TODO: not load at once
int operand_num = 1;
int input_flip = 0;
int output_flip = 0;
for (int i = 0; i < tiles + 2; i++) {
cvk_ctx->ops->parallel_enable(cvk_ctx);
// NOTICE: load each loop
test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_kernel, kernel_data);
test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_bias, bias_data);
// store stage: read back the tile computed in the previous iteration
if ((i - 2) >= 0 && (i - 2) % operand_num == operand_num - 1) {
int curr = 1 - output_flip;
store(rt_ctx, cvk_ctx, testbenchs[testcase_idx].name, tl_ofmap[curr],
tpu_output_data + store_offset, tl_mask[curr], tpu_output_mask + store_offset,
shape_sz);
store_offset += shape_sz;
}
// compute stage: run on the buffer set loaded in the previous iteration
if (i - 1 >= 0 && i - 1 < tiles) {
// get data from tl
int curr = 1 - input_flip;
// prepare command buffer
ret = testbenchs[testcase_idx].cvm_run(
cvk_ctx, tl_ifmap2[curr], tl_ifmap3[curr], tl_buf[curr], tl_mask[curr],
tl_update_tbl[curr], tl_kernel, tl_bias, threshold, w1, w2, tl_ofmap[curr]);
if (ret) {
fflush(stderr);
printf("%s", "generate commands fail, return\n");
exit(-1);
}
output_flip = 1 - output_flip;
}
// load stage: upload the next tile into the other buffer set
if (i < tiles) {
load(rt_ctx, cvk_ctx, tl_ifmap2[input_flip], input_data2 + load_offset,
tl_ifmap3[input_flip], input_data3 + load_offset, tl_ofmap[input_flip],
_ref_data + load_offset, tl_mask[input_flip], mask + load_offset,
tl_update_tbl[input_flip], update_tbl + load_offset);
load_offset += shape_sz;
input_flip = 1 - input_flip;
}
cvk_ctx->ops->parallel_disable(cvk_ctx);
}
}
// submit descriptor
test_submit_comp(rt_ctx, cvk_ctx);
gettimeofday(&t1, NULL);
elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
// compare with reference with byte
debug_print("%s comparing...", testbenchs[testcase_idx].name);
for (uint32_t i = 0; i < (uint32_t)ofmap_bytesize; i++) {
if (ref_data[i] != tpu_output_data[i]) {
debug_print("comparing failed output[%u] got %u, ref %u\n", i, tpu_output_data[i],
ref_data[i]);
// fail case
fflush(stderr);
exit(-1);
}
}
// compare another export information
// NOTE(review): only the first shape_sz bytes (one tile) of the mask are compared here,
// while the output above is compared over all ofmap_bytesize bytes - confirm intent.
if (!strcmp(testbenchs[testcase_idx].name, "update_tbl_by_threshold")) {
for (uint32_t i = 0; i < (uint32_t)shape_sz; i++) {
if (mask[i] != tpu_output_mask[i]) {
debug_print("comparing mask failed output[%u] got %u, ref %u\n", i, tpu_output_mask[i],
mask[i]);
// fail case
fflush(stderr);
exit(-1);
}
}
}
// the elapsed time is only reported for multi-tile runs
if (tiles == 1) {
debug_print("%s", " pass\n");
} else {
// get elapsed time
debug_print("(takes %ld us)\n", elapsed);
}
// free resource from tpu memory, in reverse order of allocation
for (int i = alloc_nr - 1; i >= 0; --i) {
free_tl(cvk_ctx, tl_buf[i]);
free_tl(cvk_ctx, tl_update_tbl[i]);
free_tl(cvk_ctx, tl_mask[i]);
free_tl(cvk_ctx, tl_ofmap[i]);
free_tl(cvk_ctx, tl_ifmap3[i]);
free_tl(cvk_ctx, tl_ifmap2[i]);
}
free_tl(cvk_ctx, tl_bias);
free_tl(cvk_ctx, tl_kernel);
// free resource from host memory
// free(input_data);
free(ref_data);
free(tpu_output_data);
free(tpu_output_mask);
free(input_data2);
free(input_data3);
free(mask);
free(update_tbl);
free(_ref_data);
free(kernel_data);
free(bias_data);
}
// Test driver: runs every registered testbench on a small tensor first, then
// on a heavy tensor, once without and once with ping-pong.
int main() {
  CVI_RT_HANDLE rt_ctx;
  cvk_context_t *cvk_ctx;

  // bring up the runtime / kernel structures
  test_init(&rt_ctx, &cvk_ctx);

  const int testbench_nr = sizeof(testbenchs) / sizeof(testbenchs[0]);

  // light-weight pass
  cvk_tg_shape_t tg_shape = {1, 20, 3, 4};
  for (int idx = 0; idx < testbench_nr; ++idx) {
    testbench(&rt_ctx, cvk_ctx, &tg_shape, idx);
  }

#if 1
  // heavy pass, no ping-pong
  tg_shape = {1, 128, 340, 16};
  printf("[heavy data] w/o ping pong\n");

  // NOTICE: the tile count only looks at the channel dimension
  int tiles = std::ceil(tg_shape.c / (float)cvk_ctx->info.npu_num);
  if (tg_shape.c > cvk_ctx->info.npu_num) {
    debug_print("tile nr %d channel base one npu nr %d\n", tiles, cvk_ctx->info.npu_num);
  }

  for (int idx = 0; idx < testbench_nr; ++idx) {
    testbench(&rt_ctx, cvk_ctx, &tg_shape, idx);
  }

  // heavy pass again, this time with ping-pong enabled
  tg_shape = {1, 128, 340, 16};
  printf("[heavy data] w/ ping pong\n");
  for (int idx = 0; idx < testbench_nr; ++idx) {
    testbench(&rt_ctx, cvk_ctx, &tg_shape, idx, /*is_pingpong=*/true);
  }
#endif

  // tear down the runtime / kernel structures
  test_exit(&rt_ctx, cvk_ctx);
  printf("all pass\n");
  return 0;
}

View File

@ -0,0 +1,165 @@
// \file implement activation function(sigmoid) by interpolation lookup table,
// please refer [here](https://en.wikipedia.org/wiki/Linear_interpolation) for more details
// header include
#include <assert.h>
#include <cvimath_internal.h> // math
#include <test_cvikernel_util.h> // kernel
// ========== user config ============
#define MAX_ERROR (0.004) // tolerance
// for the current example, we quantize data to -8 ~ +8
// the range depends on your activation
static int range_start = -8;
static int range_end = 8;
// ========== end of user config ============
// CPU reference: logistic sigmoid 1 / (1 + e^-x).
static double _gen_sigmoid(float x) {
  const double denominator = 1.0 + exp(-(x));
  return 1.0 / denominator;
}
// Fill `ofmap` with the bf16 CPU reference: each bf16 input is widened to
// fp32, passed through the sigmoid and narrowed back to bf16.
static void gen_ref(uint16_t *ofmap, uint16_t *ifmap, cvk_tl_shape_t ifmap_shape) {
  uint64_t idx = 0;
  while (idx < tl_shape_size(&ifmap_shape)) {
    const float x = convert_bf16_fp32(ifmap[idx]);
    ofmap[idx] = convert_fp32_bf16(_gen_sigmoid(x));
    ++idx;
  }
}
// Compare the TPU output against the CPU reference element by element.
// Every element whose absolute difference exceeds MAX_ERROR is reported on
// stderr; if at least one mismatch was found the process exits with -1.
// Returns true when all elements are within tolerance.
static bool verify(uint16_t *ofmap_data, uint16_t *ref_data, uint64_t ofmap_size) {
  int mismatches = 0;

  for (uint64_t idx = 0; idx < ofmap_size; ++idx) {
    const float got = convert_bf16_fp32(ofmap_data[idx]);
    const float expected = convert_bf16_fp32(ref_data[idx]);
    const double abs_err = fabs(got - expected);

    if (!(abs_err > MAX_ERROR)) {
      continue;  // within tolerance
    }

    fprintf(stderr,
            "[%d] comparing failed at ofmap_data[%u], got %x, exp %x, "
            "diff(%f - %f) is %f\n",
            mismatches, (uint32_t)idx, ofmap_data[idx], ref_data[idx], got, expected, abs_err);
    ++mismatches;
  }

  // any mismatch is fatal for the test
  if (mismatches != 0) {
    printf("error count is %d\n", mismatches);
    exit(-1);
  }

  return true;
}
// Fill `ifmap` with a deterministic bf16 test pattern: an integer part
// cycling through 0..6 whose sign alternates with the index parity, plus a
// small index-dependent fractional offset.
static void gen_input(uint16_t *ifmap, uint64_t ifmap_size) {
  const int table_hw = 256;

  for (uint64_t idx = 0; idx < ifmap_size; ++idx) {
    const int magnitude = (int)idx % 7;
    const int sign = ((int)idx % 2) ? 1 : -1;
    const double fraction = (idx % table_hw) * 0.002;

    // stays inside the -8 ~ +8 input range
    float input = magnitude * sign + 0.03 + fraction;
    ifmap[idx] = convert_fp32_bf16(input);
  }
}
// End-to-end sigmoid sample implemented with an interpolated lookup table.
//
// Steps: build a bf16 input pattern and its CPU reference on the host, generate
// the lookup and slope tables, load input and tables into TPU local memory,
// emit the sigmoid, read the result back and compare it with the reference
// within MAX_ERROR. verify() terminates the process on mismatch.
//
// \param ctx runtime handle passed to the test_* load/store helpers
// \param bmk kernel context used for local-memory allocation and emission
static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) {
// example for input tensor
cvk_tl_shape_t ifmap_shape = {1, 32, 16, 16};
cvk_fmt_t fmt = CVK_FMT_BF16;
// get table / input shape
cvk_tl_shape_t table_shape;
cvm_table_shape(bmk, &table_shape);
// output has the same shape as the input
cvk_tl_shape_t ofmap_shape = ifmap_shape;
uint64_t ifmap_size = tl_shape_size(&ifmap_shape);
uint64_t table_size = tl_shape_size(&table_shape);
uint64_t ofmap_size = tl_shape_size(&ofmap_shape);
// get table/input size in bytes
int data_type_size = 1;
if (fmt == CVK_FMT_BF16) {
// bf16 takes 2 bytes
data_type_size = 2;
}
uint64_t ifmap_bytesize = ifmap_size * data_type_size;
uint64_t table_bytesize = table_size * data_type_size;
uint64_t ofmap_bytesize = ofmap_size * data_type_size;
// alloc host memory
uint16_t *ifmap = (uint16_t *)xmalloc(ifmap_bytesize);
uint16_t *table_data = (uint16_t *)xmalloc(table_bytesize);
uint16_t *table_data_slope = (uint16_t *)xmalloc(table_bytesize);
uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize);
// gen input and assign data in host
gen_input(ifmap, ifmap_size);
// gen tables: interpolation needs 2 tables, one for lookup, another one for the slope
cvm_sigmoid_tbl(table_data, table_data_slope, &table_shape, range_start, range_end);
// gen reference
gen_ref(ref_data, ifmap, ofmap_shape);
// alloc input / lookup table / slope table / tmp / output in local memory
// (released below in exactly the reverse order)
cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1);
cvk_tl_t *cvk_tl_table_answer = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
cvk_tl_t *cvk_tl_table_answer_slope = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1);
// device memory load to local memory
test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)ifmap);
test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer, (uint8_t *)table_data);
test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer_slope, (uint8_t *)table_data_slope);
// get quantize(scale) value for the configured range
float scale = cvm_sigmoid_scale(range_start, range_end);
// emit core function
cvm_emit_sigmoid(bmk, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_slope,
tl_ofmap_bf16, scale);
// get result from device to host (buffer is freed with free() below)
uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, tl_ofmap_bf16);
// verify data with tolerance
verify(ofmap_data, ref_data, ofmap_size);
// release device memory in reverse allocation order
free_tl(bmk, tl_ofmap_bf16);
free_tl(bmk, tl_buf);
free_tl(bmk, cvk_tl_table_answer_slope);
free_tl(bmk, cvk_tl_table_answer);
free_tl(bmk, tl_ifmap);
// release host memory
free(ifmap);
free(table_data);
free(table_data_slope);
free(ref_data);
free(ofmap_data);
}
// Entry point of the sigmoid sample: switches the rounding mode for the
// duration of the test, runs the testbench and restores the mode afterwards.
int main() {
  CVI_RT_HANDLE ctx;
  cvk_context_t *bmk;

  // remember the current rounding mode and switch to the one used by the test
  const int saved_round_mode = set_store_feround();

  // init runtime / kernel structure
  test_init(&ctx, &bmk);

  // emit test case
  testbench(&ctx, bmk);

  // de-init runtime / kernel structure
  test_exit(&ctx, bmk);

  // put the rounding mode back
  restore_feround(saved_round_mode);
  return 0;
}

View File

@ -0,0 +1,145 @@
// \file sample for nearest-neighbor 2D upsample (cvm_upsample2d), plz refer \cvimath_internal.h for more details
// header include
#include <assert.h>
#include <cvimath_internal.h> // math
#include <test_cvikernel_util.h> // kernel
// Fill the input buffer with a ramp: element i holds i truncated to 8 bits.
static void init_input(uint8_t *input_data, uint64_t ifmap_size) {
  uint64_t pos = 0;
  while (pos < ifmap_size) {
    input_data[pos] = pos;
    ++pos;
  }
}
// Fill the weight buffer with ones.
// NOTICE: the weight MUST be 1 for the nearest upsample case.
static void init_weight(uint8_t *weight_data, uint64_t weight_size) {
  uint8_t *cursor = weight_data;
  for (uint64_t remaining = weight_size; remaining != 0; --remaining) {
    *cursor++ = 1;
  }
}
// CPU reference for nearest-neighbor upsampling of an n x c x ih x iw tensor
// by (scale_h, scale_w): every output pixel copies the input pixel at
// (hi / scale_h, wi / scale_w) of the same n/c plane. Always returns 0.
static int init_ref(uint8_t *input, uint8_t *output, int n, int c, int ih, int iw, int scale_h,
                    int scale_w) {
  const int oh = ih * scale_h;
  const int ow = iw * scale_w;

  for (int ni = 0; ni < n; ++ni) {
    for (int ci = 0; ci < c; ++ci) {
      const int plane = ni * c + ci;
      for (int out_y = 0; out_y < oh; ++out_y) {
        const int in_y = out_y / scale_h;
        // start of the current source / destination row
        const uint8_t *src_row = input + (plane * ih + in_y) * iw;
        uint8_t *dst_row = output + (plane * oh + out_y) * ow;
        for (int out_x = 0; out_x < ow; ++out_x) {
          dst_row[out_x] = src_row[out_x / scale_w];
        }
      }
    }
  }
  return 0;
}
// End-to-end nearest-neighbor upsample sample.
//
// Builds a ramp input, an all-ones weight and the CPU reference on the host,
// loads input and weight into TPU local memory, emits cvm_upsample2d, reads
// the result back and compares it byte by byte with the reference. The process
// exits with -1 on the first mismatch.
//
// \param rt_ctx   runtime handle passed to the test_* load/store/submit helpers
// \param cvk_ctx  kernel context used for local-memory allocation and emission
// \param tg_shape input shape (n, c, h, w)
static void testbench(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, cvk_tg_shape_t *tg_shape) {
// for calculate size we need in host
cvk_tl_shape_t ifmap_shape = {tg_shape->n, tg_shape->c, tg_shape->h, tg_shape->w};
// upsample scale, e.g: scale_h = 3, scale_w = 2, input h = 4, input w = 5
// output h is 4 * 3 = 12, output w is 5 * 2 = 10 with nearest
int scale_h = 3;
int scale_w = 2;
// set output shape
cvk_tl_shape_t ofmap_shape = ifmap_shape;
ofmap_shape.h = ofmap_shape.h * scale_h;
ofmap_shape.w = ofmap_shape.w * scale_w;
// weight keeps the input n/c and uses the scale factors as its h/w
cvk_tl_shape_t weight_shape = ifmap_shape;
weight_shape.h = scale_h;
weight_shape.w = scale_w;
uint64_t ifmap_size = tl_shape_size(&ifmap_shape);
uint64_t ofmap_size = tl_shape_size(&ofmap_shape);
uint64_t weight_size = tl_shape_size(&weight_shape);
// unit size is 1 bytes for int/uint 8
int data_type_size = 1;
// get input/output size
uint64_t ifmap_bytesize = ifmap_size * data_type_size;
uint64_t ofmap_bytesize = ofmap_size * data_type_size;
uint64_t weight_bytesize = weight_size * data_type_size;
// alloc on ddr
// uint8_t *input_data = (uint8_t *)xmalloc(ifmap_bytesize);
uint8_t *input_data = (uint8_t *)xmalloc(ifmap_bytesize);
uint8_t *ref_data = (uint8_t *)xmalloc(ofmap_bytesize);
uint8_t *weight_data = (uint8_t *)xmalloc(weight_bytesize);
// init input / output data in ddr
init_input(input_data, ifmap_size);
// NOTE(review): passes the byte size as the element count; the two are equal
// here only because data_type_size is 1
init_weight(weight_data, weight_bytesize); // fix pattern
init_ref(input_data, ref_data, ifmap_shape.n, ifmap_shape.c, ifmap_shape.h, ifmap_shape.w,
scale_h, scale_w);
// alloc on sram (released below in the reverse order)
cvk_fmt_t fmt = CVK_FMT_I8;
int eu_align = 1;
cvk_tl_t *tl_ifmap = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align);
cvk_tl_t *tl_weight = test_alloc_tl(cvk_ctx, weight_shape, fmt, eu_align);
cvk_tl_t *tl_ofmap = test_alloc_tl(cvk_ctx, ofmap_shape, fmt, eu_align);
// send device memory to sram
test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_ifmap, input_data);
test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_weight, weight_data);
// generate descriptor
cvm_upsample2d(cvk_ctx, tl_ifmap, tl_weight, tl_ofmap);
// submit descriptor
test_submit_comp(rt_ctx, cvk_ctx);
// get data from tl (buffer is freed with free() below)
uint8_t *ofmap_data = (uint8_t *)test_get_tensor_l2g_comp(rt_ctx, cvk_ctx, tl_ofmap);
// compare with reference byte by byte
for (uint32_t i = 0; i < ofmap_size; i++) {
if (ref_data[i] != ofmap_data[i]) {
fprintf(stderr, "comparing failed output[%u] got %u, ref %u\n", i, ofmap_data[i],
ref_data[i]);
// fail case
fflush(stderr);
exit(-1);
}
}
// free resource from tpu memory
free_tl(cvk_ctx, tl_ofmap);
free_tl(cvk_ctx, tl_weight);
free_tl(cvk_ctx, tl_ifmap);
// free resource from host memory
free(ref_data);
free(weight_data);
free(ofmap_data);
free(input_data);
}
// Entry point of the upsample sample: runs the testbench once on a small
// 1x20x3x4 tensor and reports "pass" when it returns.
int main() {
  CVI_RT_HANDLE rt_ctx;
  cvk_context_t *cvk_ctx;

  // init runtime / kernel structure
  test_init(&rt_ctx, &cvk_ctx);

  // input shape under test (alternative: {1, 20, 3, 40})
  cvk_tg_shape_t tg_shape = {1, 20, 3, 4};

  // run test
  testbench(&rt_ctx, cvk_ctx, &tg_shape);

  // de-init runtime / kernel structure
  test_exit(&rt_ctx, cvk_ctx);

  printf("pass\n");
  return 0;
}

View File

@ -0,0 +1,293 @@
#ifndef ATOMIC_FP_H_
#define ATOMIC_FP_H_
#if __arm__
#define __DISABLE_FENV__
#endif
#ifndef __DISABLE_FENV__
#include <fenv.h>
#endif
#include <math.h>
#include <stdint.h> // uint8_t / uint16_t
#ifdef __cplusplus
extern "C" {
#endif
uint8_t convert_bf16_u8(uint16_t data);
uint8_t _convert_bf16_u8(uint16_t data, int int8_rnd_md);
int8_t _convert_bf16_s8(uint16_t data, int int8_rnd_md);
int8_t convert_bf16_s8(uint16_t data);
uint16_t convert_int8_bf16(uint8_t data, uint8_t sign);
uint32_t convert_fp32_u32(float fp32);
uint32_t convert_fp32_hex(float val);
float convert_hex_fp32(uint32_t hval);
float convert_bf16_fp32(uint16_t bf16);
uint16_t convert_fp32_bf16(float fp32);
void f32_integer(void *if32, void *o_integer, int integer_size, int accumulate, int int8_signed,
int int8_rnd_md);
// void f32_integer(void *if32, void *o_integer,
// 0 for 32 bit , 1 for 16 bit , 2 for 8 bit
// int integer_size, int accumulate = 0, int int8_signed = 1, int int8_rnd_md = 0);
union convert_type_float {
float fval;
uint16_t bf16[2];
uint32_t ival;
};
typedef union convert_type_float convert_int_float;
static const uint16_t NAN_VALUE = 0x7FC0;
// static int round_mode = 0;
/* Returns 1 when x is NaN, 0 otherwise (NaN is the only value that compares
 * unequal to itself). */
static uint8_t float_isnan(const float x) {
  if (x != x) {
    return 1;
  }
  return 0;
}
int set_store_feround() {
#ifndef __DISABLE_FENV__
int round_mode = fegetround();
fesetround(FE_TOWARDZERO);
return round_mode;
#else
return 0;
#endif
}
void restore_feround(int round_mode) {
#ifndef __DISABLE_FENV__
fesetround(round_mode);
#else
(void)round_mode;
#endif
}
/* Convert a bf16 value to uint8_t with saturation to [0, 255].
 *
 * data        bf16 bit pattern
 * int8_rnd_md rounding mode forwarded to f32_integer (0: round to nearest
 *             even, otherwise: round toward zero)
 */
uint8_t _convert_bf16_u8(uint16_t data, int int8_rnd_md) {
  /* convert bf16 to float32 */
  float fp32 = convert_bf16_fp32(data);

  /* f32_integer only stores one byte in 8-bit mode, so clear the scratch word
   * first; otherwise the remaining bytes are read back uninitialised. */
  convert_int_float convert_val;
  convert_val.ival = 0;

  /* convert float32 to uint8_t (integer_size 2 selects 8 bit, unsigned) */
  f32_integer((void *)&fp32, &convert_val.ival, 2, 0, 0, int8_rnd_md);
  return (uint8_t)convert_val.ival;
}
/* Convert bf16 to uint8_t using rounding mode 0 (round to nearest even). */
uint8_t convert_bf16_u8(uint16_t data) {
  const uint8_t rounded = (uint8_t)_convert_bf16_u8(data, 0);
  return rounded;
}
/* Convert a bf16 value to int8_t with saturation to [-128, 127].
 *
 * data        bf16 bit pattern
 * int8_rnd_md rounding mode forwarded to f32_integer (0: round to nearest
 *             even, otherwise: round toward zero)
 */
int8_t _convert_bf16_s8(uint16_t data, int int8_rnd_md) {
  /* convert bf16 to float32 */
  float fp32 = convert_bf16_fp32(data);

  /* f32_integer only stores one byte in 8-bit mode, so clear the scratch word
   * first; otherwise the remaining bytes are read back uninitialised. */
  convert_int_float convert_val;
  convert_val.ival = 0;

  /* convert float32 to int8_t (integer_size 2 selects 8 bit, signed) */
  f32_integer((void *)&fp32, &convert_val.ival, 2, 0, 1, int8_rnd_md);
  return (int8_t)convert_val.ival;
}
/* Convert bf16 to int8_t using rounding mode 0 (round to nearest even). */
int8_t convert_bf16_s8(uint16_t data) {
  const int8_t rounded = (int8_t)_convert_bf16_s8(data, 0);
  return rounded;
}
/* Convert an 8-bit integer to bf16.
 * `sign` selects the interpretation of `data`: non-zero reads it as int8_t,
 * zero reads it as uint8_t. */
uint16_t convert_int8_bf16(uint8_t data, uint8_t sign) {
  int32_t widened;
  if (sign) {
    widened = (int8_t)data;
  } else {
    widened = (uint8_t)data;
  }
  /* rounding to bf16 is done by convert_fp32_bf16 */
  return convert_fp32_bf16((float)widened);
}
/* Convert fp32 to bf16 by rounding the upper 16 bits to nearest even.
 * NaN maps to NAN_VALUE; any result whose exponent field is all ones is
 * replaced by 0x7f7f (marked as HW behavior in the original code). */
uint16_t convert_fp32_bf16(float fp32) {
  if (float_isnan(fp32)) {
    return NAN_VALUE;
  }

  convert_int_float bits;
  bits.fval = fp32;

  /* bias of 0x7fff plus the lowest kept bit gives round-half-to-even */
  const uint32_t kept_lsb = (bits.ival >> 16) & 1;
  const uint32_t rounded = bits.ival + (0x7fff + kept_lsb);

  uint16_t bf16 = (uint16_t)(rounded >> 16);

  /* HW behavior */
  if ((bf16 & 0x7f80) == 0x7f80) {
    bf16 = 0x7f7f;
  }
  return bf16;
}
/* Convert fp32 to uint8_t (round to nearest even, saturate to [0, 255]). */
uint8_t convert_fp32_u8(float fp32) {
  /* f32_integer only stores one byte in 8-bit mode, so clear the scratch word
   * first; otherwise the remaining bytes are read back uninitialised. */
  convert_int_float convert_val;
  convert_val.ival = 0;
  f32_integer((void *)&fp32, &convert_val.ival, 2, 0, 0, 0);
  return (uint8_t)convert_val.ival;
}
/* Convert fp32 to int8_t (round to nearest even, saturate to [-128, 127]). */
int8_t convert_fp32_s8(float fp32) {
  /* f32_integer only stores one byte in 8-bit mode, so clear the scratch word
   * first; otherwise the remaining bytes are read back uninitialised. */
  convert_int_float convert_val;
  convert_val.ival = 0;
  f32_integer((void *)&fp32, &convert_val.ival, 2, 0, 1, 0);
  return (int8_t)convert_val.ival;
}
/* Convert fp32 to a 32-bit integer via f32_integer (round to nearest even)
 * and return its bit pattern as uint32_t. */
uint32_t convert_fp32_u32(float fp32) {
  /* clear the scratch word so no uninitialised value is handed to
   * f32_integer */
  convert_int_float convert_val;
  convert_val.ival = 0;
  f32_integer((void *)&fp32, &convert_val.ival, 0, 0, 0, 0);
  return (uint32_t)convert_val.ival;
}
/* Convert fp32 to int32_t via f32_integer (round to nearest even). */
int32_t convert_fp32_s32(float fp32) {
  /* clear the scratch word so no uninitialised value is handed to
   * f32_integer */
  convert_int_float convert_val;
  convert_val.ival = 0;
  f32_integer((void *)&fp32, &convert_val.ival, 0, 0, 1, 0);
  return (int32_t)convert_val.ival;
}
/* Reinterpret a 32-bit pattern as fp32 (no numeric conversion). */
float convert_hex_fp32(uint32_t hval) {
  convert_int_float bits;
  bits.ival = hval;
  const float as_float = bits.fval;
  return as_float;
}
/* Return the raw 32-bit pattern of an fp32 value (no numeric conversion). */
uint32_t convert_fp32_hex(float val) {
  convert_int_float bits;
  bits.fval = val;
  const uint32_t as_hex = bits.ival;
  return as_hex;
}
/* Widen bf16 to fp32: the bf16 pattern goes into half-word 1 of the union and
 * half-word 0 is cleared. */
float convert_bf16_fp32(uint16_t bf16) {
  convert_int_float widened;
  widened.bf16[0] = 0;
  widened.bf16[1] = bf16;
  return widened.fval;
}
/* Split a float into the integer part of its magnitude and the remaining
 * fractional part, working directly on the IEEE-754 bit pattern.
 *
 * x            value to split
 * integer_part receives the integer part of |x|; saturates to 0x7fffffff
 *              (or 0x80000000 when `sign` is set) once the unbiased exponent
 *              exceeds 30
 * sub_part     receives x minus its truncated value (keeps the sign of x),
 *              or 0 when x has no fractional bits
 * sign         non-zero selects the 0x80000000 saturation value
 */
void flt2int_flt(float x, unsigned long long *integer_part, float *sub_part, uint8_t sign) {
  convert_int_float work_x;
  work_x.fval = x;

  /* unbiased exponent */
  const int level_code = (int)((work_x.ival >> 23) & 0xff) - 127;

  if (level_code < 0) {
    /* negative exponent: |x| < 1, so the integer part is zero */
    *integer_part = 0;
    *sub_part = x;
    return;
  }

  if (level_code > 30) {
    /* Out of 32-bit range (includes inf/NaN). Checked before any shifting so
     * the shift amount can never reach the width of the type. The negative
     * limit was previously written as 0x800000000 (2^35), a typo for 2^31. */
    *integer_part = sign ? 0x80000000ULL : 0x7fffffffULL;
    *sub_part = 0;
    return;
  }

  /* 24-bit significand with the implicit leading one restored */
  unsigned long long tail_code = (work_x.ival & 0x7fffff) | 0x800000;

  if (level_code < 23) {
    /* some mantissa bits are fractional: shift them out for the integer part
     * and mask them off to obtain the truncated float */
    *integer_part = tail_code >> (23 - level_code);
    work_x.ival &= 0xffffffff << (23 - level_code);
    *sub_part = x - work_x.fval;
  } else {
    /* 23 <= exponent <= 30: the value is already an integer */
    *integer_part = tail_code << (level_code - 23);
    *sub_part = 0;
  }
}
/* Convert a float to int with saturation.
 *
 * ifval       value to convert
 * int8_rnd_md 0: round half to even; any other value: round toward zero
 *
 * Values at or beyond +/-2^31, infinities and NaN saturate to INT_MAX /
 * INT_MIN according to the sign bit. The saturation test runs before the
 * floating-to-integer cast, because casting an out-of-range or NaN double to
 * an integer type is undefined. The sign is read through a 32-bit union
 * member; the previous `unsigned long` member is 8 bytes on LP64 targets and
 * does not overlay a 4-byte float portably.
 */
inline static int flt2int(float ifval, int int8_rnd_md) {
  union {
    float floatNum;
    uint32_t intNum;
  } bits;
  bits.floatNum = ifval;

  const int is_negative = (bits.intNum & 0x80000000u) != 0;
  const float abs_fval = is_negative ? -ifval : ifval;

  double whole;
  const double frac = modf((double)abs_fval, &whole);

  /* written as a negated '<' so that NaN also takes the saturating path */
  if (!(whole < 2147483648.0)) {
    return is_negative ? (-2147483647 - 1) : 2147483647;
  }

  /* whole is integral and below 2^31 here, so the cast is exact */
  unsigned long long result = (unsigned long long)whole;

  if (int8_rnd_md == 0) {
    /* round to nearest, ties to even */
    if (frac > 0.5 || (frac == 0.5 && (result & 0x1))) {
      result += 1;
    }
  }
  /* otherwise: round toward zero, keep the truncated value */

  /* floats at or above 2^24 carry no fraction, so rounding cannot push the
   * result past the 32-bit limits at this point */
  if (is_negative) {
    return (int)(-(long long)result);
  }
  return (int)result;
}
/* Convert the float at `if32` to an integer and store (or accumulate) it at
 * `o_integer`.
 *
 * if32         points to the float to convert
 * o_integer    destination; must be at least as wide as `integer_size` selects
 * integer_size 0: 32 bit, 1: 16 bit, anything else: 8 bit
 * accumulate   non-zero adds the converted value to the previous destination
 *              content instead of overwriting it
 * int8_signed  8-bit mode only: saturate to [-128, 127] when non-zero,
 *              [0, 255] otherwise
 * int8_rnd_md  rounding mode forwarded to flt2int
 *
 * The previous destination value is read only when accumulating, and only at
 * the selected width. Before, it was always read as a 4-byte int (and the
 * 16-bit value was taken from that int), which touched uninitialised or
 * out-of-bounds bytes for narrower or fresh destinations.
 */
void f32_integer(void *if32, void *o_integer, int integer_size, int accumulate, int int8_signed,
                 int int8_rnd_md) {
  const int i_tmp = flt2int(*(float *)if32, int8_rnd_md);

  if (integer_size == 0) {
    int *o32 = (int *)o_integer;
    const int prev = accumulate ? *o32 : 0;
    *o32 = i_tmp;
    if (accumulate) {
      *o32 += prev;
    }
  } else if (integer_size == 1) {
    short *o16 = (short *)o_integer;
    const short prev = accumulate ? *o16 : 0;
    *o16 = i_tmp;
    if (accumulate) {
      *o16 += prev;
    }
  } else {
    char *o8 = (char *)o_integer;
    const char prev = accumulate ? *o8 : 0;
    const int min = (int8_signed) ? -128 : 0;
    const int max = (int8_signed) ? 127 : 255;

    /* saturate to the selected 8-bit range */
    if (i_tmp < min) {
      *o8 = min;
    } else if (i_tmp > max) {
      *o8 = max;
    } else {
      *o8 = i_tmp;
    }

    if (accumulate) {
      *o8 += prev;
    }
  }
}
#ifdef __cplusplus
}
#endif
#endif /* ATOMIC_FP_H_ */

View File

@ -0,0 +1,12 @@
project(cvimath)

# Sources are collected by glob; re-run CMake after adding or removing files.
file(GLOB SRC ./*.c ./*.cpp)

# Shared library.
add_library(${PROJECT_NAME} SHARED ${SRC})
# Target-scoped include path instead of the directory-wide include_directories().
target_include_directories(${PROJECT_NAME} PRIVATE ${CMAKE_SOURCE_DIR}/include)
# NOTE(review): the keyword-less signature is kept on purpose in case other
# CMake files add libraries to these targets with the plain signature (the two
# forms cannot be mixed on one target) - confirm and switch to PUBLIC/PRIVATE.
target_link_libraries(${PROJECT_NAME} ${TPU_KERNEL_LIB})

# Static library built from the same sources.
add_library(${PROJECT_NAME}-static STATIC ${SRC})
target_include_directories(${PROJECT_NAME}-static PRIVATE ${CMAKE_SOURCE_DIR}/include)
target_link_libraries(${PROJECT_NAME}-static ${TPU_KERNEL_LIB})

install(TARGETS ${PROJECT_NAME} ${PROJECT_NAME}-static DESTINATION lib)

1361
cvimath/src/bf16_gemm.c Normal file

File diff suppressed because it is too large Load Diff

82
cvimath/src/blas_cpu.cpp Normal file
View File

@ -0,0 +1,82 @@
#include <cvimath_internal.h>
#include <bits/stdc++.h>
#ifdef __ARM_ARCH
#include <arm_neon.h>
#endif
// Select the k largest elements of `array` in descending order.
//
// \param array      input values; DESTROYED by the call (each selected slot
//                   is overwritten with the lowest value of T)
// \param index      receives the positions of the selected elements (k entries)
// \param value      receives the selected values (k entries)
// \param array_size number of elements in `array`
// \param k          number of elements to select
//
// A selected slot is marked with std::numeric_limits<T>::lowest() rather than
// 0, so it cannot be selected again while the remaining candidates are
// negative or zero (e.g. negative similarities from signed data). Nothing is
// written when `array` is empty. If k exceeds array_size the surplus entries
// report the marker value.
template <typename T>
void k_selection_sort_index(T *array, uint32_t *index, T *value, const uint32_t array_size,
                            const uint32_t k) {
  if (array_size == 0) {
    return;  // nothing to select from; array[0] must not be read
  }

  const T consumed = std::numeric_limits<T>::lowest();

  for (uint32_t i = 0; i < k; i++) {
    uint32_t largest = 0;
    for (uint32_t j = 1; j < array_size; j++) {
      if (array[j] > array[largest]) {
        largest = j;
      }
    }
    value[i] = array[largest];
    index[i] = largest;
    array[largest] = consumed;
  }
}
// Dot product of two uint8_t vectors of `data_length` elements, accumulated
// in 32 bits.
inline uint32_t dot(uint8_t *a, uint8_t *b, uint32_t data_length) {
  uint32_t acc = 0;
  uint32_t pos = 0;
  while (pos < data_length) {
    acc += ((short)a[pos] * b[pos]);
    ++pos;
  }
  return acc;
}
// Dot product of two int8_t vectors of `data_length` elements, accumulated
// in 32 bits.
inline int32_t dot_i8(int8_t *a, int8_t *b, uint32_t data_length) {
  int32_t acc = 0;
  uint32_t pos = 0;
  while (pos < data_length) {
    acc += ((short)a[pos] * b[pos]);
    ++pos;
  }
  return acc;
}
// Pre-compute the Euclidean length of each of the `data_num` int8 vectors
// (each `data_length` elements long) stored back to back in `precached`.
// unit_precached_arr[i] receives the length of vector i.
void cvm_gen_precached_i8_unit_length(int8_t *precached, float *unit_precached_arr,
                                      const uint32_t data_length, const uint32_t data_num) {
  int8_t *vec = precached;
  for (uint32_t row = 0; row < data_num; ++row, vec += data_length) {
    const float squared_norm = dot_i8(vec, vec, data_length);
    unit_precached_arr[row] = sqrt(squared_norm);
  }
}
// Pre-compute the Euclidean length of each of the `data_num` uint8 vectors
// (each `data_length` elements long) stored back to back in `precached`.
// unit_precached_arr[i] receives the length of vector i.
void cvm_gen_precached_u8_unit_length(uint8_t *precached, float *unit_precached_arr,
                                      const uint32_t data_length, const uint32_t data_num) {
  uint8_t *vec = precached;
  for (uint32_t row = 0; row < data_num; ++row, vec += data_length) {
    const float squared_norm = dot(vec, vec, data_length);
    unit_precached_arr[row] = sqrt(squared_norm);
  }
}
// CPU matching for int8 features: computes the normalized inner product
// (dot product divided by both vector lengths) between `feature` and each of
// the `data_num` pre-cached vectors into `buffer`, then selects the `k`
// largest scores into k_index / k_value. `buffer` is scratch space that the
// selection step modifies.
void cvm_cpu_i8data_ip_match(int8_t *feature, int8_t *precached, float *unit_precached_arr,
                             uint32_t *k_index, float *k_value, float *buffer,
                             const uint32_t data_length, const uint32_t data_num,
                             const uint32_t k) {
  // length of the query vector
  float unit_feature = (float)dot_i8(feature, feature, data_length);
  unit_feature = sqrt(unit_feature);

  int8_t *candidate = precached;
  for (uint32_t row = 0; row < data_num; ++row, candidate += data_length) {
    const float norm_product = unit_feature * unit_precached_arr[row];
    buffer[row] = dot_i8(feature, candidate, data_length) / norm_product;
  }

  k_selection_sort_index(buffer, k_index, k_value, data_num, k);
}
// CPU matching for uint8 features: computes the normalized inner product
// (dot product divided by both vector lengths) between `feature` and each of
// the `data_num` pre-cached vectors into `buffer`, then selects the `k`
// largest scores into k_index / k_value. `buffer` is scratch space that the
// selection step modifies.
void cvm_cpu_u8data_ip_match(uint8_t *feature, uint8_t *precached, float *unit_precached_arr,
                             uint32_t *k_index, float *k_value, float *buffer,
                             const uint32_t data_length, const uint32_t data_num,
                             const uint32_t k) {
  // length of the query vector
  float unit_feature = (float)dot(feature, feature, data_length);
  unit_feature = sqrt(unit_feature);

  uint8_t *candidate = precached;
  for (uint32_t row = 0; row < data_num; ++row, candidate += data_length) {
    const float norm_product = unit_feature * unit_precached_arr[row];
    buffer[row] = dot(feature, candidate, data_length) / norm_product;
  }

  k_selection_sort_index(buffer, k_index, k_value, data_num, k);
}

118
cvimath/src/chl_quan.cpp Normal file
View File

@ -0,0 +1,118 @@
#include <cvimath_internal.h>
#include <assert.h>
#include <limits.h>
#include <math.h>
#include <iostream>
// Decompose a real multiplier in (0, 1] into a 32-bit fixed-point multiplier
// and a right shift, so that
//   real_multiplier ~= quantized_multiplier * 2^-31 * 2^-right_shift.
//
// \param real_multiplier      value to quantize; must be in (0, 1]
// \param quantized_multiplier receives the fixed-point multiplier
// \param right_shift          receives the compensating right shift
//
// For inputs outside (0, 1] - including NaN - an error is printed and both
// outputs are set to 0.
void cvm_get_chl_quan(float real_multiplier, uint32_t *quantized_multiplier, int *right_shift) {
  // Written as a negated range test so that NaN is rejected too, instead of
  // falling through to the float-to-integer cast below.
  if (!(real_multiplier > 0.f && real_multiplier <= 1.f)) {
    std::cerr << "Multiplier should be bigger than 0, smaller or euqal to 1." << std::endl;
    *quantized_multiplier = 0;
    *right_shift = 0;
    return;
  }

  if (real_multiplier == 1.f) {
    // largest representable multiplier: 2^31 - 1
    *quantized_multiplier = (uint32_t)(1ll << 31) - 1;
    *right_shift = 0;
    return;
  }

  int s = 0;
  // We want to bring the real multiplier into the interval [1/2, 1).
  // We can do so by multiplying it by two, and recording how many times
  // we multiplied by two so that we can compensate that by a right
  // shift by the same amount.
  while (real_multiplier < 0.5f) {
    real_multiplier *= 2.0f;
    s++;
  }

  // Now that the real multiplier is in [1/2, 1), we convert it
  // into a fixed-point number.
  int64_t q = static_cast<int64_t>(round(real_multiplier * (1ll << 31)));
  assert(q <= (1ll << 31));

  // Handle the special case when the real multiplier was so close to 1
  // that its fixed-point approximation was undistinguishable from 1.
  // We handle this by dividing it by two, and remembering to decrement
  // the right shift amount.
  if (q == (1ll << 31)) {
    q /= 2;
    s--;
  }
  assert(s >= 0);
  // The multiplier must fit in 31 bits. The previous check against LONG_MAX
  // was always true on LP64 targets.
  assert(q <= (1ll << 31) - 1);

  *quantized_multiplier = (uint32_t)q;
  *right_shift = s;
}
// Serialize per-channel calibration data into `packed_data`.
// For every channel, in order: the 32-bit bias (only when `has_bias` is
// set), the 32-bit multiplier, then the 8-bit shift. 32-bit values are
// written least-significant byte first.
inline void cvm_pack_per_chan_cal_data(uint32_t channels, bool has_bias, int32_t *bias,
                                       uint32_t *multiplier, int8_t *shift, uint8_t *packed_data) {
  uint8_t *out = packed_data;

  for (uint32_t ch = 0; ch < channels; ++ch) {
    if (has_bias) {
      const uint32_t bias_bits = (uint32_t)bias[ch];
      for (int bit = 0; bit < 32; bit += 8) {
        *out++ = (bias_bits >> bit) & 0xff;
      }
    }

    const uint32_t mult_bits = multiplier[ch];
    for (int bit = 0; bit < 32; bit += 8) {
      *out++ = (mult_bits >> bit) & 0xff;
    }

    const uint8_t shift_bits = shift[ch];
    *out++ = shift_bits;
  }
}
// Fill `cal_data` with packed per-channel calibration data in which every
// one of the `c` channels uses the same multiplier and shift.
//
// \param c                    number of channels
// \param quantized_multiplier multiplier replicated to every channel
// \param right_shift          shift replicated to every channel; values
//                             below 0 are stored as 0
// \param cal_data             output buffer for the packed data
// \param bias_data            per-channel bias, read only when has_bias is set
// \param has_bias             whether the bias is packed as well
void cvm_fill_chl_quan_data(const uint32_t c, const uint32_t quantized_multiplier,
                            const int right_shift, uint8_t *cal_data, int32_t *bias_data,
                            bool has_bias) {
  // temporary per-channel arrays handed to the packer
  uint32_t *multiplier_data = new uint32_t[c];
  int8_t *shift_data = new int8_t[c];

  for (unsigned int i = 0; i < c; ++i) {
    multiplier_data[i] = quantized_multiplier;
    // Our H/W only supports right shift
    shift_data[i] = right_shift > 0 ? right_shift : 0;
#ifdef ENABLE_DEBUG_MSG
    // Prints the local arrays; the previous code referenced an undeclared
    // `p_param`, which broke the build whenever ENABLE_DEBUG_MSG was defined.
    printf(" [oc=%u] multiplier_data %u, shift_data %d\n", i, multiplier_data[i],
           shift_data[i]);
#endif
  }

  cvm_pack_per_chan_cal_data(c, has_bias, bias_data, multiplier_data, shift_data, cal_data);

  delete[] multiplier_data;
  delete[] shift_data;
}
// Allocate and fill a packed per-channel calibration buffer for `c` channels
// that all share the same multiplier and shift.
//
// \return a malloc()-ed buffer the caller must free(), or NULL when the
//         allocation fails
uint8_t *cvm_get_chl_quan_data(const uint32_t c, const uint32_t quantized_multiplier,
                               const int &right_shift, int32_t *bias_data, bool has_bias) {
  const int per_chan_cal_data_size =
      has_bias ? CVK_MULTIPLIER_BIAS_PACKED_DATA_SIZE : CVK_MULTIPLIER_ONLY_PACKED_DATA_SIZE;

  // computed in size_t so a large channel count cannot overflow int
  const size_t cal_data_size = (size_t)c * (size_t)per_chan_cal_data_size;

  uint8_t *cal_data = (uint8_t *)malloc(cal_data_size);
  if (cal_data == NULL) {
    // do not let the fill step write through a null pointer
    return NULL;
  }

  cvm_fill_chl_quan_data(c, quantized_multiplier, right_shift, cal_data, bias_data, has_bias);
  return cal_data;
}

1032
cvimath/src/common.c Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,138 @@
#include <cvimath_internal.h>
#include "gen_lut.h"
// Reset the two fields of a global-memory tensor descriptor that the callers
// below do not set themselves (int8_rnd_mode and base_reg_index); every other
// field is left untouched.
static void init_tgmem(cvk_tg_t* t) {
  t->int8_rnd_mode = 0;
  t->base_reg_index = 0;
}
// Convert an fp32 tensor in global memory to bf16 with one global-to-global
// TDMA copy that picks up only the upper 16 bits of every fp32 element.
//
// \param ctx        kernel context
// \param gaddr_fp32 global address of the fp32 source
// \param fp32_shape source shape expressed in 16-bit units; w must be even
// \param gaddr_bf16 global address of the bf16 destination
// \param cvm_shape  destination shape
// \param fmt        must be CVK_FMT_BF16
// \return always 0
int cvm_s2s_fp32_bf16(cvk_context_t* ctx, uint64_t gaddr_fp32, cvk_tg_shape_t fp32_shape,
                      uint64_t gaddr_bf16, cvk_tg_shape_t cvm_shape, cvk_fmt_t fmt) {
  int ret = 0;
  ASSERT(fmt == CVK_FMT_BF16 && "only support CVK_FMT_BF16");
  ASSERT(fp32_shape.w % 2 == 0 && "fp32's w MUST align with 2");

  cvk_tdma_g2g_tensor_copy_param_t p;
  cvk_tg_t src, dst;
  init_tgmem(&src);
  init_tgmem(&dst);

  // one fp32 element spans two 16-bit units
  int fp32_w = 2;

  // source: view every fp32 as a row of width 1 starting at its high half, so
  // the h stride skips the low half
  src.fmt = fmt;
  src.start_address = gaddr_fp32 + fp32_w;  // copy from high part
  src.shape = fp32_shape;
  src.shape.h = fp32_shape.w * fp32_shape.h / fp32_w;
  src.shape.w = 1;

  int fmt_sz = bytesize_of_fmt(fmt);
  src.stride.n = fp32_shape.w * fp32_shape.h * fp32_shape.c * fmt_sz;
  src.stride.c = fp32_shape.w * fp32_shape.h * fmt_sz;
  src.stride.h = fp32_w * fmt_sz;

  // destination: densely packed bf16 with the same (h * w, 1) reshaping
  dst.fmt = fmt;
  dst.start_address = gaddr_bf16;
  dst.shape = cvm_shape;
  dst.shape.h = cvm_shape.w * cvm_shape.h / fp32_w;
  dst.shape.w = 1;
  dst.stride = ctx->ops->tg_default_stride(ctx, dst.shape, dst.fmt);

  p.src = &src;
  p.dst = &dst;
  // previously left uninitialised; cvm_bf16_fp32 sets it to 0 for the same
  // parameter type
  p.layer_id = 0;
  ctx->ops->tdma_g2g_bf16_tensor_copy(ctx, &p);

  return ret;
}
// Convert a bf16 tensor in global memory to fp32 (default implementation:
// sys->sys, i.e. global memory to global memory).
//
// The active (#else) branch first fills the fp32 destination with the constant
// 0, then copies the bf16 source into the upper 16 bits of every fp32 element
// by temporarily viewing the destination as (h * w, 1) with an h stride of 4
// bytes and a start address shifted by 2 bytes. The destination descriptor is
// modified during the call and restored before returning.
//
// \param cvk_ctx kernel context
// \param tg_bf16 bf16 source tensor descriptor
// \param tg_fp32 fp32 destination tensor descriptor (temporarily modified)
void cvm_bf16_fp32(cvk_context_t* cvk_ctx, cvk_tg_t* tg_bf16, cvk_tg_t* tg_fp32) {
#if 0
// disabled alternative: sys->local->sys implement
cvk_fmt_t fmt = tg_bf16->fmt;
cvk_tl_shape_t tl_shape;
int ctrl = CTRL_AL; // eu align
tl_shape.n = tg_fp32->shape.n;
tl_shape.c = tg_fp32->shape.c;
tl_shape.h = tg_fp32->shape.h;
tl_shape.w = tg_fp32->shape.w;
// 1. fill local memory to 0 for mantissa
cvk_tl_t *tl_ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, tg_bf16->fmt, ctrl);
cvk_tiu_mul_param_t p0;
p0.res_high = NULL;
p0.res_low = tl_ofmap;
p0.a = tl_ofmap;
p0.b_is_const = 1;
p0.b_const.val = 0;
p0.b_const.is_signed = 0;
p0.rshift_bits = 0;
p0.relu_enable = 0;
p0.layer_id = 0;
cvk_ctx->ops->tiu_mul(cvk_ctx, &p0);
// pretend the same shape, reshape h, w to h * w, 1
int fmt_bytesize = cvm_bytesize_of_fmt(tl_ofmap->fmt);
tl_ofmap->shape.w = 1;
tl_ofmap->shape.h = tg_bf16->shape.h * tg_bf16->shape.w;
tl_ofmap->stride.h = 4;
tl_ofmap->stride.c = align_up(tg_fp32->shape.w * tg_fp32->shape.h * fmt_bytesize,
cvk_ctx->info.eu_num);
tl_ofmap->stride.n = tl_ofmap->stride.c * ceiling_func(tg_fp32->shape.c,
cvk_ctx->info.npu_num);
// 2. load from tg with reshaped w
// FIXME: check overwrite
tl_ofmap->start_address = tl_ofmap->start_address + 2;// 2 means shift fp32 high 16 part
cvk_tdma_g2l_tensor_copy_param_t p;
p.src = tg_bf16;
p.dst = tl_ofmap;
cvk_ctx->ops->tdma_g2l_bf16_tensor_copy(cvk_ctx, &p);
// 3. store back to tg
tl_ofmap->start_address = tl_ofmap->start_address - 2; //revert
tl_ofmap->shape = tl_shape;
tl_ofmap->stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_ofmap->shape, fmt, ctrl);
cvk_tdma_l2g_tensor_copy_param_t p1;
p1.src = tl_ofmap;
p1.dst = tg_fp32;
cvk_ctx->ops->tdma_l2g_bf16_tensor_copy(cvk_ctx, &p1);
cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ofmap);
#else
// sys->sys implement
// 1. fill tg with the low 16 bit part as 0
cvk_tdma_l2g_tensor_fill_constant_param_t p0;
p0.constant = 0;
p0.dst = tg_fp32;
p0.layer_id = 0;
cvk_ctx->ops->tdma_l2g_tensor_fill_constant(cvk_ctx, &p0);
// 2. sys->sys
cvk_tdma_g2g_tensor_copy_param_t p1;
cvk_tg_shape_t shape = tg_fp32->shape; // backup, restored below
cvk_tg_stride_t stride = tg_fp32->stride; // backup, restored below
// view the destination as (h * w, 1) so each row is one fp32 element
tg_fp32->shape.w = 1;
tg_fp32->shape.h = tg_bf16->shape.h * tg_bf16->shape.w;
tg_fp32->stride.h = 4; // 4 bytes: one fp32 element per row
tg_fp32->start_address = tg_fp32->start_address + 2; // +2 means shift from high part
p1.src = tg_bf16;
p1.dst = tg_fp32;
p1.layer_id = 0;
cvk_ctx->ops->tdma_g2g_bf16_tensor_copy(cvk_ctx, &p1);
// restore the caller's descriptor
tg_fp32->start_address = tg_fp32->start_address - 2;
tg_fp32->shape = shape;
tg_fp32->stride = stride;
#endif
}

207
cvimath/src/gen_lut.h Normal file
View File

@ -0,0 +1,207 @@
#ifndef GEN_LUT_1880v2_H
#define GEN_LUT_1880v2_H
#include <assert.h>
#include <cvimath_internal.h>
#include <test_cvikernel_util.h>
#define IN
#define OUT
#define ASSERT(x) assert(x)
// Exponent range start value (-62).
static inline int cvm_exp_start() {
  return -62;
}

// Exponent range end value (63).
static inline int cvm_exp_end() {
  return 63;
}

// Lookup table height.
static inline int cvm_table_h() {
  return 32;
}

// Lookup table width.
static inline int cvm_table_w() {
  return 8;
}

// Number of entries in one lookup table (height * width).
static inline int cvm_table_hw() {
  const int entries = cvm_table_h() * cvm_table_w();
  return entries;
}

// Half of the table entry count.
static inline int half_h_table() {
  const int entries = cvm_table_h() * cvm_table_w();
  return entries / 2;
}
// Check that a table shape has the expected h/w of cvm_table_h() x
// cvm_table_w(). Asserts on mismatch in debug builds and returns the result
// of the comparison otherwise.
static inline bool is_1880v2_tbl_shape(cvk_tl_shape_t *s) {
  // FIXME: h could be reduce less than 32
  assert(s->h == (uint32_t)cvm_table_h() && s->w == (uint32_t)cvm_table_w() &&
         "table h/w should be 32/8");

  const bool height_ok = s->h == (uint32_t)cvm_table_h();
  const bool width_ok = s->w == (uint32_t)cvm_table_w();
  return height_ok && width_ok;
}
// Copy the descriptor fields of one local-memory tensor to another: address,
// format, shape, stride and int8 rounding mode. No tensor data is moved.
static inline void bmk1880v2_tensor_lmem_s_copy(cvk_tl_t *dst, cvk_tl_t *src) {
  // layout
  dst->shape = src->shape;
  dst->stride = src->stride;
  // location and element type
  dst->start_address = src->start_address;
  dst->fmt = src->fmt;
  dst->int8_rnd_mode = src->int8_rnd_mode;
}
// Build in `dst` an 8-bit (I8/U8) descriptor over the same local memory as
// the bf16 tensor `src`: same start address, width doubled, default stride
// for the new shape and format. No tensor data is moved.
static inline void bmk1880v2_tensor_lmem_s_copy_bf16_8(cvk_context_t *ctx, cvk_tl_t *dst,
                                                       cvk_tl_t *src, cvk_fmt_t fmt) {
  assert(src->fmt == CVK_FMT_BF16 && (fmt == CVK_FMT_I8 || fmt == CVK_FMT_U8) &&
         "only support bf16->i8/uint8_t, plz check fmt\n");

  dst->start_address = src->start_address;
  dst->fmt = fmt;

  // same n/c/h, twice the width
  cvk_tl_shape_t doubled = src->shape;
  doubled.w *= 2;
  dst->shape = doubled;

  dst->stride = ctx->ops->tl_default_stride(ctx, dst->shape, fmt, CTRL_NULL);
  dst->int8_rnd_mode = src->int8_rnd_mode;
}
// "l2l" variant: build in `dst` an 8-bit (I8/U8) descriptor from the bf16
// tensor `src` while keeping the shape unchanged; only the format and the
// default stride for that format differ. No tensor data is moved.
static inline void bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(cvk_context_t *ctx, cvk_tl_t *dst,
                                                           cvk_tl_t *src, cvk_fmt_t fmt) {
  assert(src->fmt == CVK_FMT_BF16 && (fmt == CVK_FMT_I8 || fmt == CVK_FMT_U8) &&
         "only support bf16->i8/uint8_t, plz check fmt\n");

  dst->start_address = src->start_address;
  dst->fmt = fmt;

  const cvk_tl_shape_t same_shape = src->shape;
  dst->shape = same_shape;

  dst->stride = ctx->ops->tl_default_stride(ctx, dst->shape, fmt, CTRL_NULL);
  dst->int8_rnd_mode = src->int8_rnd_mode;
}
int cvm_emit_square(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *OUT tl_ofmap_bf16,
cvk_fmt_t fmt);
void cvm_table_check(cvk_tl_t *IN tl_ifmap, cvk_tl_t *tbl_answer, cvk_tl_t *tbl_answer_mantissa,
cvk_tl_t *OUT tl_ofmap_bf16);
int cvm_lut_exp_mantissa(cvk_context_t *ctx, cvk_tl_t *IN tl_ifmap, cvk_tl_t *IN tl_buf,
cvk_tl_t *tbl_answer, cvk_tl_t *tbl_answer_mantissa,
cvk_tl_t *OUT tl_ofmap_bf16);
void cvm_get_uint8_t_tbl_idx(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *OUT tl_ofmap_bf16);
void cvm_get_dec(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
cvk_tl_t *OUT tl_ofmap_bf16);
void cvm_get_dec_fractions(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *OUT buf,
cvk_tl_t *OUT tl_ofmap_bf16);
int cvm_emit_abs(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *OUT tl_ofmap_bf16,
cvk_fmt_t fmt);
int _cvm_lut_exp_mantissa(cvk_context_t *ctx, cvk_tl_t *IN tl_ifmap, cvk_tl_t *IN tl_buf,
cvk_tl_t *tbl_answer, cvk_tl_t *tbl_answer_mantissa,
cvk_tl_t *OUT tl_ofmap_bf16, bool is_dirty_ifmap);
int _cvm_atan_fast_emit(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2,
cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf,
cvk_tl_t *tl_table_answer, cvk_tl_t *tl_table_answer_mantissa,
cvk_tl_t *OUT tl_ofmap_bf16, cvk_fmt_t fmt, float b, bool is_dirty_ifmap);
int cvm_emit_x_over_y(cvk_context_t *ctx, cvk_tl_t *IN x, cvk_tl_t *IN y, cvk_tl_t *IN tl_buf,
cvk_tl_t *OUT tl_ofmap_bf16, cvk_tl_t *tl_table_answer,
cvk_tl_t *tl_table_answer_mantissa, cvk_fmt_t fmt, bool is_dirty_ifmap);
int _cvm_emit_mask(cvk_context_t *ctx, cvk_tl_t *IN tl_ifmap, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2,
cvk_tl_t *tl_buf3, cvk_tl_t *tl_pos_neg_table, cvk_tl_t *tl_0_idx_table,
cvk_tl_t *OUT tl_ofmap_bf16, cvk_fmt_t fmt, enum CVM_MASK_TYPE mask,
bool is_dirty_ifmap);
void _cvm_get_tbl_idx(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *OUT tl_ofmap_bf16,
cvk_fmt_t src_fmt, int int8_rnd_mode);
int __cvm_atan_fast_emit(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
cvk_tl_t *tl_buf2, cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_invert_buf,
cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_table_answer,
cvk_tl_t *tl_table_answer_mantissa, cvk_tl_t *OUT tl_ofmap_bf16,
cvk_fmt_t fmt);
// no need to export the following to users
// mask: please refer to \CVM_MASK_TYPE for the supported cases
int cvm_emit_mask_gt0(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_pos_neg_buf,
cvk_tl_t *tl_0_idx_buf, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);
int cvm_emit_mask_ge0(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
cvk_tl_t *tl_pos_neg_table, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);
int cvm_emit_mask_le0(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
cvk_tl_t *tl_pos_neg_table, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);
int cvm_emit_mask_eq0(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
cvk_tl_t *tl_0_idx_table, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);
int cvm_emit_mask_lt0(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
cvk_tl_t *tl_pos_neg_table, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);
int _cvm_atan_emit(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2,
cvk_tl_t *tl_buf3, cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_slope_buf,
cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_table_answer,
cvk_tl_t *tl_table_answer_mantissa, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt,
float b);
void cvm_emit_mask_ge0_lt0(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *index_i8,
cvk_tl_t *tl_buf3, cvk_fmt_t fmt);
void cvm_emit_mask_eq_0(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *tl_buf, cvk_tl_t *index_i8,
cvk_tl_t *tl_buf3, cvk_fmt_t fmt);
int cvm_lut_exp_mantissa(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
cvk_tl_t *tbl_answer, cvk_tl_t *tbl_answer_mantissa,
cvk_tl_t *tl_ofmap_bf16);
int cvm_emit_pythagoras(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *x, cvk_tl_t *tl_buf,
cvk_tl_t *tl_buf2, cvk_tl_t *tl_sqrt_table_answer,
cvk_tl_t *tl_sqrt_table_answer_mantissa, cvk_tl_t *tl_ofmap_bf16,
cvk_fmt_t fmt);
int cvm_emit_max_const(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ofmap_bf16,
cvk_fmt_t fmt, float b);
int cvm_emit_min_const(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ofmap_bf16,
cvk_fmt_t fmt, float b);
int cvm_emit_0_1_revert(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
cvk_tl_t *tbl_answer, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);
int cvm_emit_mul(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2,
cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);
int cvm_emit_add(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2,
cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);
int cvm_emit_add_const(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ofmap_bf16,
cvk_fmt_t fmt, float b);
int cvm_emit_mul_const(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ofmap_bf16,
cvk_fmt_t fmt, float b);
// not released yet
void cvm_atan2_emit(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *x, cvk_tl_t *tl_buf,
cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_buf4, cvk_tl_t *tl_buf5,
cvk_tl_t *tl_buf6, cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_slope_buf,
cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_table_answer,
cvk_tl_t *tl_table_answer_mantissa, cvk_tl_t *tl_sqrt_table_answer,
cvk_tl_t *tl_sqrt_table_answer_mantissa, cvk_tl_t *tl_0_idx_table,
cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);
int cvm_atan_slope_multipilier(cvk_context_t *cvk_ctx, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2,
cvk_tl_t *tl_buf3, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ofmap_bf16,
cvk_fmt_t fmt);
int cvm_atan_fast_emit(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf,
cvk_tl_t *tl_buf2, cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_invert_buf,
cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_table_answer,
cvk_tl_t *tl_table_answer_mantissa, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt,
bool is_dirty_ifmap);
void cvm_atan2_fast_emit(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *x, cvk_tl_t *tl_buf,
cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_buf4,
cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_slope_buf, cvk_tl_t *tl_invert_buf,
cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_table_answer,
cvk_tl_t *tl_table_answer_mantissa, cvk_tl_t *tl_0_idx_table,
cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt);
// conv used
int cvm_reshape_channel_same_pad(
cvk_context_t *cvk_ctx, int ic, int ih, int iw, int kh, int kw, int pad_right, int pad_left,
int stride_h, int stride_w, cvk_tl_shape_t *tl_load_shape, cvk_tl_stride_t *new_tl_ifmap_stride,
cvk_tg_shape_t *new_tg_ifmap_shape, cvk_tg_stride_t *new_tg_ifmap_stride,
cvk_tl_shape_t *new_tl_weight_shape, cvk_tl_shape_t *new_tl_bias_shape,
cvk_tl_shape_t *new_tl_ofmap_shape, cvk_fmt_t fmt, int eu_align);
#endif /* GEN_LUT_1880v2_H */

File diff suppressed because it is too large Load Diff

1106
cvimath/src/tiu_lut_atan.c Normal file

File diff suppressed because it is too large Load Diff

787
cvimath/src/tiu_lut_atan2.c Normal file
View File

@ -0,0 +1,787 @@
/**
* \brief atan2 implemented on top of atan, please refer to https://en.wikipedia.org/wiki/Atan2
* NOTICE: current epsilon is set to 0.1
*/
#include <cvimath_internal.h>
#include "gen_lut.h" // NOLINT
//#define DBG
/**
 * Emit the "atan(y / x)" building block used by cvm_atan2_emit:
 *   1. tl_buf  = reciprocal(x)   (tl_buf2 is handed to the helper as its buffer)
 *   2. tl_buf4 = y * tl_buf
 *   3. tl_ofmap_bf16 = _cvm_atan_emit(tl_buf4, ..., b)
 *
 * \param y, x            inputs
 * \param tl_buf..tl_buf4 scratch tensors, all overwritten
 * \param tl_y0_buf, tl_slope_buf, tl_invert_buf, tl_pos_neg_table,
 *        tl_table_answer, tl_table_answer_mantissa  lookup tables forwarded to the helpers
 * \param tl_ofmap_bf16   [out] result
 * \param fmt             data format forwarded to _cvm_atan_emit
 * \param b               forwarded to _cvm_atan_emit. Callers in this file pass M_PI where
 *                        "atan(y/x) + PI" is wanted, so it presumably acts as an additive
 *                        offset - TODO confirm against _cvm_atan_emit.
 */
static void _cvm_atan2_emit_case_3(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf,
                                   cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_buf4,
                                   cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_slope_buf,
                                   cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table,
                                   cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_mantissa,
                                   cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, float b) {
  // case 3
  // atan( y / x)

  // x0 = reciprocal(x)
  cvm_emit_reciprocal(ctx, x, tl_buf2, tl_table_answer, tl_table_answer_mantissa, tl_buf);

  // y0 = x0 * y
  // Zero-initialise the descriptor so that any member not assigned below
  // reaches the driver with a defined value instead of stack garbage.
  cvk_tiu_mul_param_t p1 = {0};
  p1.res_high = NULL;
  p1.res_low = tl_buf4;
  p1.a = y;
  p1.b_is_const = 0;
  p1.b = tl_buf;
  p1.rshift_bits = 0;
  p1.relu_enable = 0;
  ctx->ops->tiu_mul(ctx, &p1);

  // x0 = atan(y0)
  _cvm_atan_emit(ctx, tl_buf4, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, tl_slope_buf, tl_invert_buf,
                 tl_pos_neg_table, tl_table_answer, tl_table_answer_mantissa, OUT tl_ofmap_bf16,
                 fmt, b);
}
/**
 * Convenience wrapper around _cvm_atan2_emit_case_3 that forwards every
 * argument unchanged and passes 0.0 as the trailing \b argument.
 */
static void cvm_atan2_emit_case_3(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf,
                                  cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_buf4,
                                  cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_slope_buf,
                                  cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table,
                                  cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_mantissa,
                                  cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) {
  const float no_offset = 0.0;

  _cvm_atan2_emit_case_3(ctx, y, x, tl_buf, tl_buf2, tl_buf3, tl_buf4, tl_y0_buf, tl_slope_buf,
                         tl_invert_buf, tl_pos_neg_table, tl_table_answer,
                         tl_table_answer_mantissa, tl_ofmap_bf16, fmt, no_offset);
}
// NOTICE: it could dirty \y
/**
* atan2(y, x) is expressed through atan() by splitting the input into the
* cases listed in [here](https://en.wikipedia.org/wiki/Atan2).
*
* Structure visible in the calls below: for every case a candidate value is
* computed, a mask is produced with one of the cvm_emit_mask_* helpers, the
* candidate is multiplied by the mask and the product is added into
* \tl_ofmap_bf16. The last step multiplies the accumulated result with the
* inverted "x = 0 && y = 0" mask.
* NOTE(review): this composition only works if the mask helpers produce 0/1
* valued maps - confirm against cvm_emit_mask_*.
*
* \param y, x             inputs
* \param tl_buf..tl_buf6  scratch tensors, overwritten several times
* \param tl_y0_buf, tl_slope_buf, tl_invert_buf, tl_pos_neg_table,
*        tl_table_answer, tl_table_answer_mantissa, tl_0_idx_table
*                         lookup tables forwarded to the helpers
* \param tl_sqrt_table_answer, tl_sqrt_table_answer_mantissa
*                         only passed to cvm_table_check in this function
* \param tl_ofmap_bf16    [out] accumulated result
* \param fmt              data format forwarded to every helper
*/
void cvm_atan2_emit(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf,
cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_buf4, cvk_tl_t* tl_buf5,
cvk_tl_t* tl_buf6, cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_slope_buf,
cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table, cvk_tl_t* tl_table_answer,
cvk_tl_t* tl_table_answer_mantissa, cvk_tl_t* tl_sqrt_table_answer,
cvk_tl_t* tl_sqrt_table_answer_mantissa, cvk_tl_t* tl_0_idx_table,
cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) {
// argument checks; every tensor / table argument is passed through
// cvm_table_check at least once
cvm_table_check(y, tl_y0_buf, tl_slope_buf, x);
cvm_table_check(tl_buf, tl_invert_buf, tl_pos_neg_table, tl_buf2);
cvm_table_check(tl_buf3, tl_table_answer, tl_table_answer_mantissa, tl_buf4);
cvm_table_check(tl_buf6, tl_table_answer, tl_0_idx_table, tl_buf5);
cvm_table_check(y, tl_sqrt_table_answer, tl_sqrt_table_answer_mantissa, x);
// cases handled below:
// atan(y/x), x > 0
// atan(y/x) + PI , x < 0 and y >= 0
// atan(y/x) - PI , x < 0 and y < 0
// pi / 2, x = 0 and y > 0
// -pi / 2, x = 0 and y < 0
// 0, x = 0 and y = 0
// ---- atan(y/x), x > 0 ----
// tl_buf4 = max(x, 0), then tl_ofmap_bf16 = atan(y / tl_buf4)
cvm_emit_max_const(ctx, x, tl_buf4, fmt, 0.0);
cvm_atan2_emit_case_3(ctx, y, tl_buf4, tl_buf, tl_buf2, tl_buf3, tl_buf5, tl_y0_buf, tl_slope_buf,
tl_invert_buf, tl_pos_neg_table, tl_table_answer, tl_table_answer_mantissa,
tl_ofmap_bf16, fmt);
// x > 0 mask into tl_buf2, applied to the result
cvm_emit_mask_gt0(ctx, x, tl_buf, tl_buf3, tl_buf4, tl_pos_neg_table, tl_0_idx_table, tl_buf2,
fmt);
cvm_emit_mul(ctx, tl_ofmap_bf16, tl_buf2, tl_ofmap_bf16, fmt);
// ---- atan(y/x) + PI , x < 0 and y >= 0 ----
// tl_buf4 = min(x, 0); M_PI is handed to the helper as its trailing argument
// instead of the separate add kept as a comment below
cvm_emit_min_const(ctx, x, tl_buf4, fmt, 0.0);
_cvm_atan2_emit_case_3(ctx, y, tl_buf4, tl_buf, tl_buf2, tl_buf3, tl_buf5, tl_y0_buf,
tl_slope_buf, tl_invert_buf, tl_pos_neg_table, tl_table_answer,
tl_table_answer_mantissa, tl_buf6, fmt, M_PI);
// cvm_emit_add_const(ctx, tl_buf6, tl_buf6, fmt, M_PI);
// get index map that x < 0 and y >= 0
// !(y >= 0) = !(y < 0)
#if 0
cvm_emit_pos_idx(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt);
// y == 0
cvm_emit_0_idx(ctx, y, tl_buf, tl_0_idx_table, tl_buf2, fmt);
cvm_emit_add(ctx, tl_buf2, tl_buf3, tl_buf2, fmt);
#else
// y >= 0
cvm_emit_mask_ge0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf2, fmt);
#endif
// x < 0
cvm_emit_mask_lt0(ctx, x, tl_buf, tl_pos_neg_table, tl_buf3, fmt);
// x < 0 && y >= 0: combine both masks, apply to tl_buf6 and accumulate
cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt);
cvm_emit_mul(ctx, tl_buf6, tl_buf2, tl_buf, fmt);
cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt);
// ---- atan(y/x) - PI , x < 0 and y < 0 ----
// NOTE(review): atan(y / min(x, 0)) is computed a second time here (offset 0)
// and shifted by -PI afterwards
cvm_emit_min_const(ctx, x, tl_buf4, fmt, 0.0);
cvm_atan2_emit_case_3(ctx, y, tl_buf4, tl_buf, tl_buf2, tl_buf3, tl_buf5, tl_y0_buf, tl_slope_buf,
tl_invert_buf, tl_pos_neg_table, tl_table_answer, tl_table_answer_mantissa,
tl_buf6, fmt);
cvm_emit_add_const(ctx, tl_buf6, tl_buf6, fmt, -1.0 * M_PI);
// x < 0 and y < 0
// we leverage x <= 0 and y <= 0 cuz we filter out x = 0 case, speed up it
// x < 0
cvm_emit_mask_lt0(ctx, x, tl_buf, tl_pos_neg_table, tl_buf2, fmt);
// y < 0
cvm_emit_mask_lt0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt);
// x < 0 && y < 0: combine both masks, apply to tl_buf6 and accumulate
cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt);
cvm_emit_mul(ctx, tl_buf6, tl_buf2, tl_buf, fmt);
cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt);
// ---- pi / 2, x = 0 and y > 0 ----
// x = 0
cvm_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1
// y > 0
cvm_emit_mask_gt0(ctx, y, tl_buf, tl_buf5, tl_buf4, tl_pos_neg_table, tl_0_idx_table, tl_buf3,
fmt);
// x = 0 && y > 0: scale the combined mask by pi/2 and accumulate
cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt);
cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, fmt, M_PI / 2.0);
cvm_emit_add(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt);
// ---- -pi / 2, x = 0 and y < 0 ----
// x = 0
cvm_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1
// y < 0
cvm_emit_mask_lt0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt);
// x = 0 && y < 0: scale the combined mask by -pi/2 and accumulate
cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt);
cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, fmt, -1.0 * M_PI / 2.0);
cvm_emit_add(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt);
// ---- 0, x = 0 and y = 0 ----
// x = 0
cvm_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1
// y = 0
cvm_emit_mask_eq0(ctx, y, tl_buf, tl_0_idx_table, tl_buf3, fmt); // 0.003 could consider 1
// x = 0 && y = 0
cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf3, fmt);
// !(x = 0 and y = 0) keep it: invert the mask and multiply it into the result
cvm_emit_0_1_revert_input(ctx, tl_buf3, tl_buf, tl_buf2, fmt);
cvm_emit_mul(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt);
}
// ==== fast version ===
/**
 * Fast "atan(y / x)" building block.
 *
 *   1. tl_buf = cvm_emit_x_over_y(y, x)
 *   2. if \y_over_x is non-NULL, it receives tl_buf + 0 (i.e. a copy of the quotient)
 *   3. tl_ofmap_bf16 = _cvm_atan_fast_emit(tl_buf, ..., b)
 *
 * \param y, x          inputs. \x is also handed to _cvm_atan_fast_emit in its buffer
 *                      position - NOTE(review): \x is presumably overwritten, confirm.
 * \param tl_buf        scratch tensor, receives the quotient
 * \param tl_y0_buf, tl_invert_buf, tl_pos_neg_table,
 *        tl_table_answer, tl_table_answer_mantissa  lookup tables forwarded to the helpers
 * \param tl_ofmap_bf16 [out] result
 * \param y_over_x      [out, optional] copy of the quotient, may be NULL
 * \param fmt           data format forwarded to the helpers
 * \param b             forwarded unchanged to _cvm_atan_fast_emit
 */
static void __cvm_atan2_fast_emit_case_3(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x,
                                         cvk_tl_t* tl_buf, cvk_tl_t* tl_y0_buf,
                                         cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table,
                                         cvk_tl_t* tl_table_answer,
                                         cvk_tl_t* tl_table_answer_mantissa,
                                         cvk_tl_t* OUT tl_ofmap_bf16, cvk_tl_t* OUT y_over_x,
                                         cvk_fmt_t fmt, float b) {
  // case 3
  // atan( y / x)

  // y0 = y / x
  // (the preprocessor-disabled reciprocal-LUT + multiply alternative was removed)
  cvm_emit_x_over_y(ctx, y, x, NULL, tl_buf, tl_table_answer, tl_table_answer_mantissa, fmt, true);

  // optionally hand the quotient back to the caller (add 0 acts as a copy)
  if (y_over_x) {
    cvm_emit_add_const(ctx, tl_buf, y_over_x, fmt, 0);
  }

  // x0 = atan(y0)
  _cvm_atan_fast_emit(ctx, tl_buf, x, NULL, tl_y0_buf, tl_invert_buf, tl_pos_neg_table,
                      tl_table_answer, tl_table_answer_mantissa, OUT tl_ofmap_bf16, fmt, b, true);
}
// NOTE: everything down to the matching #endif is excluded from the build by
// the "#if 0" below. It is an earlier variant of the fast atan2 emitter; its
// inline comments describe how 0/1 maps are derived through bf16 <-> int8
// copies. NOTE(review): it references names such as bf16_emit_add_const in
// nested disabled branches - confirm they still exist before re-enabling.
#if 0
static void _cvm_atan2_fast_emit(cvk_context_t *ctx,
cvk_tl_t* y,
cvk_tl_t* tl_buf,
cvk_tl_t* tl_buf2,
cvk_tl_t* tl_buf4,
cvk_tl_t* tl_y0_buf,
cvk_tl_t* tl_invert_buf,
cvk_tl_t* tl_pos_neg_table,
cvk_tl_t* tl_table_answer,
cvk_tl_t* tl_table_answer_mantissa,
cvk_tl_t* OUT tl_ofmap_bf16,
cvk_tl_t* OUT tl_buf3,
cvk_fmt_t fmt) {
// case 3
// atan( y / x)
#if 0
// x0 = reciprocal(tl_buf)
_cvm_lut_exp_mantissa(ctx,
tl_buf,
NULL,
tl_table_answer,
tl_table_answer_mantissa,
tl_buf2,
true
);
// y0 = x0 * y
cvm_emit_mul(ctx, y, tl_buf2, tl_buf2, fmt);
#else
#if 0
cvm_emit_x_over_y(ctx, y, tl_buf, NULL, tl_buf2,
tl_table_answer, tl_table_answer_mantissa, fmt, true);
if (tl_buf3) {
bf16_emit_add_const(ctx, tl_buf2, tl_buf3, fmt, 0);
}
#else
//if (tl_buf3) {
// cvm_emit_add_const(ctx, tl_buf, tl_buf3, fmt, 0);
//}
// get xy == 0 and y < 0, add pi
// using xy to depend x = 0 or y = 0
// recipical y < 0 get 0xFEFF, y > 0 get 0x7F7F,
// 1. b = xy to get other/(x = 0 or y = 0)
// 2. c = b * 2^64 to saturate it
// 3. c(bf16) = c(int8) >> 10 to get 1/0 map, 1 indicate xy > 0
// 4. c = c * -1 + 1 to invert map, 1 indicate x = 0 or y = 0
// 5. d = b(int8) - 0x7f, 0 means y > 0
// 6. d = d(int8) + 0xff to get inf
cvm_emit_mul(ctx, y, tl_buf, tl_buf2, fmt);
// get 7f7f / 0
cvm_emit_mul_const(ctx, tl_buf2, tl_ofmap_bf16, fmt, convert_bf16_fp32(0x7f00));
//// 1 = 0x3f80
//bf16_emit_mul_const(ctx, tl_buf2, tl_ofmap_bf16, fmt, 0);
//bf16_emit_add_const(ctx, tl_ofmap_bf16, tl_buf4, fmt, 1.0);
// bf16->uint8_t and back uint8_t->bf16 to get 0/1 map
#if 1
cvk_tl_t index_uint8_t;
bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &index_uint8_t, tl_buf2, CVK_FMT_U8);
index_uint8_t.shape.w = index_uint8_t.shape.w / 2;
index_uint8_t.stride = ctx->ops->tl_default_stride(ctx, index_uint8_t.shape,
CTRL_NULL, CVK_FMT_I8);
index_uint8_t.fmt = CVK_FMT_I8;
cvk_tdma_l2l_tensor_copy_param_t p1;
p1.src = tl_ofmap_bf16;
p1.dst = &index_uint8_t;
ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1);
cvk_tiu_mul_param_t p;
#if 0
p.res_high = NULL;
p.res_low = &index_uint8_t;
p.a = &index_uint8_t;
p.b_is_const = 1;
p.b_const.val =-1;
p.b_const.is_signed = 1;
p.rshift_bits = 0;
p.relu_enable = 0;
ctx->ops->tiu_mul(ctx, &p);
#else
p.res_high = NULL;
p.res_low = &index_uint8_t;
p.a = &index_uint8_t;
p.b_is_const = 1;
p.b_const.val =-1;
p.b_const.is_signed = 1;
p.rshift_bits = 7;
p.relu_enable = 0;
ctx->ops->tiu_mul(ctx, &p);
#endif
// get -1/0 map, -1 indicate xy != 0
p1.src = &index_uint8_t;
p1.dst = tl_ofmap_bf16;
ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1);
// x * (-1) + 1 get 0/1 map, 1 indicate xy == 0
//bf16_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, -1.0);
cvm_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 1.0);
// get y < 0, bf16->int8 and mul 0xff to get -128 and righshift to get 1
cvm_emit_mul_const(ctx, y, tl_buf3, fmt, pow(2,64));
p1.src = tl_buf3;
p1.dst = &index_uint8_t;
ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1);
p.res_high = 0;
p.res_low = &index_uint8_t;
p.a = &index_uint8_t;
p.b_is_const = 1;
p.b_const.val =-128;
p.b_const.is_signed = 1;
p.rshift_bits = 0;
p.relu_enable = 1;
ctx->ops->tiu_mul(ctx, &p);
p.res_high = 0;
p.res_low = &index_uint8_t;
p.a = &index_uint8_t;
p.b_is_const = 1;
p.b_const.val =1;
p.b_const.is_signed = 1;
p.rshift_bits = 7;
p.relu_enable = 1;
ctx->ops->tiu_mul(ctx, &p);
// get y < 0
p1.src = &index_uint8_t;
p1.dst = tl_buf4;
ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1);
cvm_emit_mul_const(ctx, tl_buf4, tl_buf4, fmt, -1.0);
// get y > 0
// y * (-1) + 1 get 0/1 map, 1 indicate xy == 0
cvm_emit_add_const(ctx, tl_buf4, tl_buf2, fmt, 1.0);
cvm_emit_add(ctx, tl_buf2, tl_buf4, tl_buf2, fmt);
// merge y > 0 && y < 0 && x == 0
cvm_emit_mul(ctx, tl_ofmap_bf16, tl_buf2, tl_buf3, fmt);
//bf16_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 0);
//bf16_emit_mul_const(ctx, tl_ofmap_bf16, tl_buf3, fmt, M_PI);
#endif
cvm_emit_x_over_y(ctx, y, tl_buf, NULL, tl_buf2,
tl_table_answer, tl_table_answer_mantissa, fmt, true);
#endif
#endif
// x0 = atan(y0)
__cvm_atan_fast_emit(ctx,
tl_buf2,
tl_buf,
tl_buf4,
tl_y0_buf,
tl_invert_buf,
tl_pos_neg_table,
tl_table_answer,
tl_table_answer_mantissa,
OUT tl_ofmap_bf16,
fmt);
// abs tl_buf3
// revert and mul to clean !(x == 0 && (y != 0) case
// add pi/2
cvm_emit_mul_const(ctx, tl_buf3, tl_buf2, fmt, -1);
cvk_tiu_min_param_t p3;
p3.min = tl_buf2;
p3.a = tl_buf3;
p3.b_is_const = 0;
p3.b = tl_buf2;
ctx->ops->tiu_min(ctx, &p3);
cvm_emit_add_const(ctx, tl_buf2, tl_buf2, fmt, 1.0);
cvm_emit_mul(ctx, tl_ofmap_bf16, tl_buf2, tl_ofmap_bf16, fmt);
cvm_emit_mul_const(ctx, tl_buf3, tl_buf3, fmt, M_PI_2);
cvm_emit_add(ctx, tl_buf3, tl_ofmap_bf16, tl_ofmap_bf16, fmt);
}
#endif
/**
 * Same as __cvm_atan2_fast_emit_case_3 but without exporting the y/x
 * quotient: NULL is passed for the callee's optional \y_over_x output, every
 * other argument is forwarded unchanged.
 */
static void _cvm_atan2_fast_emit_case_3(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x,
                                        cvk_tl_t* tl_buf, cvk_tl_t* tl_y0_buf,
                                        cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table,
                                        cvk_tl_t* tl_table_answer,
                                        cvk_tl_t* tl_table_answer_mantissa,
                                        cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, float b) {
  // case 3
  // atan( y / x)
  // Plain call: ISO C does not allow `return <expression>;` in a function
  // returning void, even when the expression itself has type void.
  __cvm_atan2_fast_emit_case_3(ctx, y, x, tl_buf, tl_y0_buf, tl_invert_buf, tl_pos_neg_table,
                               tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16,
                               NULL, fmt, b);
}
/**
* Fast atan2(y, x): same case split and mask-and-accumulate structure as
* cvm_atan2_emit, but built on _cvm_atan2_fast_emit_case_3 and with fewer
* scratch buffers (tl_buf..tl_buf4).
*
* For every case a candidate value is computed, a mask is produced, the
* candidate is multiplied by the mask and the product is added into
* \tl_ofmap_bf16. The last step multiplies the accumulated result with the
* inverted "x = 0 && y = 0" mask.
* NOTE(review): this composition only works if the mask helpers produce 0/1
* valued maps - confirm against cvm_emit_mask_* / _cvm_emit_mask.
*
* \param y, x             inputs
* \param tl_buf..tl_buf4  scratch tensors, overwritten several times
* \param tl_slope_buf     only passed to cvm_table_check in this function
* \param tl_y0_buf, tl_invert_buf, tl_pos_neg_table, tl_table_answer,
*        tl_table_answer_mantissa, tl_0_idx_table  lookup tables forwarded to the helpers
* \param tl_ofmap_bf16    [out] accumulated result
* \param fmt              data format forwarded to every helper
*/
void cvm_atan2_fast_emit(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf,
cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_buf4,
cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_slope_buf, cvk_tl_t* tl_invert_buf,
cvk_tl_t* tl_pos_neg_table, cvk_tl_t* tl_table_answer,
cvk_tl_t* tl_table_answer_mantissa, cvk_tl_t* tl_0_idx_table,
cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) {
// argument checks
cvm_table_check(y, tl_y0_buf, tl_slope_buf, x);
cvm_table_check(tl_buf, tl_invert_buf, tl_pos_neg_table, tl_buf2);
cvm_table_check(tl_buf3, tl_table_answer, tl_table_answer_mantissa, tl_buf4);
// NOTE(review): tl_buf4 is passed as both the first and the last argument here
cvm_table_check(tl_buf4, tl_table_answer, tl_0_idx_table, tl_buf4);
// cases handled below:
// atan(y/x), x > 0
// atan(y/x) + PI , x < 0 and y >= 0
// atan(y/x) - PI , x < 0 and y < 0
// pi / 2, x = 0 and y > 0
// -pi / 2, x = 0 and y < 0
// 0, x = 0 and y = 0
// ---- atan(y/x), x > 0 ----
// tl_buf = max(x, 0), then tl_ofmap_bf16 = atan(y / tl_buf)
cvm_emit_max_const(ctx, x, tl_buf, fmt, 0.0);
_cvm_atan2_fast_emit_case_3(ctx, y, tl_buf, tl_buf2, tl_y0_buf, tl_invert_buf, tl_pos_neg_table,
tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16, fmt, 0.0);
// x > 0 mask (tl_buf is passed both as a buffer and as the output), applied to the result
cvm_emit_mask_gt0(ctx, x, tl_buf, tl_buf2, tl_buf3, tl_pos_neg_table, tl_0_idx_table, tl_buf,
fmt);
cvm_emit_mul(ctx, tl_ofmap_bf16, tl_buf, tl_ofmap_bf16, fmt);
// ---- atan(y/x) + PI , x < 0 and y >= 0 ----
// tl_buf = min(x, 0); M_PI is handed to the helper as its trailing argument
// instead of the separate add kept as a comment below
cvm_emit_min_const(ctx, x, tl_buf, fmt, 0.0);
_cvm_atan2_fast_emit_case_3(ctx, y, tl_buf, tl_buf2, tl_y0_buf, tl_invert_buf, tl_pos_neg_table,
tl_table_answer, tl_table_answer_mantissa, tl_buf4, fmt, M_PI);
// cvm_emit_add_const(ctx, tl_buf4, tl_buf4, fmt, M_PI);
// get index map that x < 0 and y >= 0
// !(y >= 0) = !(y < 0)
#if 0
cvm_emit_pos_idx(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt);
// y == 0
cvm_emit_0_idx(ctx, y, tl_buf, tl_0_idx_table, tl_buf2, fmt);
cvm_emit_add(ctx, tl_buf2, tl_buf3, tl_buf2, fmt);
#else
// y >= 0
cvm_emit_mask_ge0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf2, fmt);
#endif
// x < 0
cvm_emit_mask_lt0(ctx, x, tl_buf, tl_pos_neg_table, tl_buf3, fmt);
// x < 0 && y >= 0: combine both masks, apply to tl_buf4 and accumulate
cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt);
cvm_emit_mul(ctx, tl_buf4, tl_buf2, tl_buf, fmt);
cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt);
// ---- atan(y/x) - PI , x < 0 and y < 0 ----
// NOTE(review): atan(y / min(x, 0)) is computed a second time here (offset 0)
// and shifted by -PI afterwards
cvm_emit_min_const(ctx, x, tl_buf, fmt, 0.0);
_cvm_atan2_fast_emit_case_3(ctx, y, tl_buf, tl_buf2, tl_y0_buf, tl_invert_buf, tl_pos_neg_table,
tl_table_answer, tl_table_answer_mantissa, tl_buf4, fmt, 0.0);
cvm_emit_add_const(ctx, tl_buf4, tl_buf4, fmt, -1.0 * M_PI);
// x < 0 and y < 0
// we leverage x <= 0 and y <= 0 cuz we filter out x = 0 case, speed up it
// x < 0
cvm_emit_mask_lt0(ctx, x, tl_buf, tl_pos_neg_table, tl_buf2, fmt);
// y < 0
cvm_emit_mask_lt0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt);
// x < 0 && y < 0: combine both masks, apply to tl_buf4 and accumulate
cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt);
cvm_emit_mul(ctx, tl_buf4, tl_buf2, tl_buf, fmt);
cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt);
// ---- pi / 2, x = 0 and y > 0 ----
// x = 0
cvm_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1
// y > 0
// cvm_emit_mask_gt0(ctx, y, tl_buf, tl_buf5, tl_buf4, tl_pos_neg_table, tl_0_idx_table, tl_buf3,
// fmt);
// the generic mask emitter is called directly, with NULL for its third buffer
_cvm_emit_mask(ctx, y, tl_buf, tl_buf4, NULL, tl_pos_neg_table, tl_0_idx_table, tl_buf3, fmt,
CVM_MASK_TYPE_GT_0, true);
// x = 0 && y > 0: scale the combined mask by pi/2 and accumulate
cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt);
cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, fmt, M_PI / 2.0);
cvm_emit_add(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt);
// ---- -pi / 2, x = 0 and y < 0 ----
// x = 0
cvm_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1
// y < 0
cvm_emit_mask_lt0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt);
// x = 0 && y < 0: scale the combined mask by -pi/2 and accumulate
cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt);
cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, fmt, -1.0 * M_PI / 2.0);
cvm_emit_add(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt);
// ---- 0, x = 0 and y = 0 ----
// x = 0
cvm_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1
// y = 0
cvm_emit_mask_eq0(ctx, y, tl_buf, tl_0_idx_table, tl_buf3, fmt); // 0.003 could consider 1
// x = 0 && y = 0
cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf3, fmt);
// !(x = 0 and y = 0) keep it: invert the mask and multiply it into the result
cvm_emit_0_1_revert_input(ctx, tl_buf3, tl_buf, tl_buf2, fmt);
cvm_emit_mul(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt);
}
/**
 * Emit the "x < 0" map of \x into \tl_buf2.
 *
 * Sequence issued:
 *   1. tl_buf   = min(x, 0)
 *   2. tl_buf   = tl_buf * 2^64
 *   3. index_i8 = l2l bf16 tensor copy of tl_buf
 *   4. index_i8 = index_i8 * -128, relu enabled
 *   5. index_i8 = (index_i8 * 1) >> 7, relu enabled
 *   6. tl_buf2  = l2l bf16 tensor copy of index_i8
 *
 * NOTE(review): presumably the result is 1 where x < 0 and 0 elsewhere (the
 * 2^64 multiply saturating negative values before the int8 conversion) -
 * confirm against the hardware conversion rules.
 *
 * \param x        input
 * \param tl_buf   scratch tensor, overwritten
 * \param index_i8 int8 descriptor used as intermediate, overwritten
 * \param fmt      data format forwarded to cvm_emit_mul_const
 * \param tl_buf2  [out] resulting map
 */
static void _x_lt_0(cvk_context_t* ctx, cvk_tl_t* x, cvk_tl_t* tl_buf, cvk_tl_t* index_i8,
                    cvk_fmt_t fmt, cvk_tl_t* OUT tl_buf2) {
  // Zero-initialise every descriptor so that members which are not assigned
  // below reach the driver with a defined value instead of stack garbage.
  cvk_tiu_min_param_t p7 = {0};
  cvk_tiu_mul_param_t p = {0};
  cvk_tdma_l2l_tensor_copy_param_t p1 = {0};

  // x < 0
  p7.min = tl_buf;
  p7.a = x;
  p7.b_is_const = 1;
  p7.b_const.val = 0;
  p7.b_const.is_signed = 1;
  ctx->ops->tiu_min(ctx, &p7);

  cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, pow(2, 64));

  p1.src = tl_buf;
  p1.dst = index_i8;
  ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1);

  p.res_high = 0;
  p.res_low = index_i8;
  p.a = index_i8;
  p.b_is_const = 1;
  p.b_const.val = -128;
  p.b_const.is_signed = 1;
  p.rshift_bits = 0;
  p.relu_enable = 1;
  ctx->ops->tiu_mul(ctx, &p);

  p.res_high = 0;
  p.res_low = index_i8;
  p.a = index_i8;
  p.b_is_const = 1;
  p.b_const.val = 1;
  p.b_const.is_signed = 1;
  p.rshift_bits = 7;
  p.relu_enable = 1;
  ctx->ops->tiu_mul(ctx, &p);

  // get x < 0
  p1.src = index_i8;
  p1.dst = tl_buf2;
  ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1);
}
/**
 * atan2(y, x) with a reduced number of lookup tables; shared implementation
 * behind cvm_atan2_merge_emit and cvm_atan2_fast_degree_emit.
 *
 * Every PI / PI/2 constant added below is multiplied by \degree_factor
 * (callers in this file pass 1.0 and 180 / M_PI).
 * NOTE(review): the atan() result of step 1 itself is not scaled in this
 * function - confirm that this is intended for the degree variant.
 *
 * \param y, x             inputs
 * \param tl_buf..tl_buf3  scratch tensors, overwritten several times
 * \param tl_y0_buf, tl_invert_buf, tl_pos_neg_table,
 *        tl_table_answer, tl_table_answer_mantissa  lookup tables forwarded to the helpers
 * \param tl_ofmap_bf16    [out] result
 * \param fmt              data format forwarded to every helper
 * \param degree_factor    multiplier applied to the PI based constants
 */
static void _cvm_atan2_merge_emit(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf,
                                  cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_y0_buf,
                                  cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table,
                                  cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_mantissa,
                                  cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, float degree_factor) {
  cvm_table_check(y, tl_y0_buf, tl_invert_buf, x);
  cvm_table_check(tl_buf, tl_invert_buf, tl_pos_neg_table, tl_buf2);
  cvm_table_check(tl_buf3, tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16);

  // int8 descriptor aliasing a scratch buffer; zero-initialised so that the
  // members the copy helper does not assign still hold a defined value
  cvk_tl_t index_i8 = {0};
  bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, &index_i8, tl_buf2, CVK_FMT_I8);

  /**
   * step 1. atan(y/x)
   */
  // tl_buf = x, done as tl_buf * 0 followed by x + tl_buf
  cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, 0.0);
  cvm_emit_add(ctx, x, tl_buf, tl_buf, fmt);

  cvm_emit_x_over_y(ctx, y, tl_buf, NULL, tl_buf2, tl_table_answer, tl_table_answer_mantissa, fmt,
                    true);

  // x0 = atan(y0)
  __cvm_atan_fast_emit(ctx, tl_buf2, tl_buf, tl_buf3, tl_y0_buf, tl_invert_buf, tl_pos_neg_table,
                       tl_table_answer, tl_table_answer_mantissa, OUT tl_ofmap_bf16, fmt);

  // from here on the int8 descriptor aliases tl_buf
  bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, &index_i8, tl_buf, CVK_FMT_I8);

  // seperate y >= 0 or < 0 to handle 0 degree / 180 degree
  cvm_emit_mask_ge0_lt0(ctx, y, &index_i8, tl_buf3, fmt);

  /**
   * step 2. set x == 0, y >=0 to pi/2, y < 0 to -pi/2
   * FIXME: atan(0) not eq PI/2
   */
  // x = 0 and y != 0
  // reset all x = 0
  // y >= 0 as pi/2, y < 0 as -pi/2
  // merge
  cvm_emit_mask_eq_0(ctx, x, tl_buf, &index_i8, tl_buf2, fmt);

  // clear x = 0
  cvm_emit_mul_const(ctx, tl_buf2, tl_buf, fmt, -1);
  cvm_emit_mul(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt);

  // get revert map, x = -x + 1 cuz original -1 means x != 0
  cvm_emit_mul_const(ctx, tl_buf3, tl_buf, fmt, M_PI_2 * degree_factor);
  cvm_emit_add_const(ctx, tl_buf2, tl_buf2, fmt, 1);
  cvm_emit_mul(ctx, tl_buf, tl_buf2, tl_buf, fmt);
  cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt);

  /**
   * step 3. handle x < 0 && y != 0
   */
  // x < 0
  _x_lt_0(ctx, x, tl_buf, &index_i8, fmt, tl_buf2);

  // x < 0 && (y >= 1 && y < 1)
  cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf, fmt);
  cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, M_PI * degree_factor);
  cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt);

  /**
   * 4. handle x != 0 && y == 0, x>0: 0, x<0: PI, tpu atan default all pi/2
   */
  // tl_buf2 as x < 0
  // get y == 0, tl_buf3 keep y>=0 is 1, y<1 = -1
  cvm_emit_mask_eq_0(ctx, y, tl_buf, &index_i8, tl_buf3, fmt);

  // revert
  cvm_emit_mul_const(ctx, tl_buf3, tl_buf, fmt, -1.0);

  // reset y = 0 x = ? as 0, other case leave to step 5
  cvm_emit_mul(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt);

  /**
   * 5. set y == 0 and x < 0 as pi
   */
  // get y == 0
  cvm_emit_add_const(ctx, tl_buf3, tl_buf, fmt, 1.0);

  // y == 0 && x < 0
  cvm_emit_mul(ctx, tl_buf, tl_buf2, tl_buf, fmt);
  cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, M_PI * degree_factor);

  // merge
  cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt);
}
/**
 * \brief atan2(y, x) with a reduced number of lookup tables. The steps below
 *        follow the step comments inside _cvm_atan2_merge_emit:
 * 1. atan(y/x)
 * 2. handle x = 0: y >= 0 is set to pi/2, y < 0 to -pi/2
 * 3. handle x < 0 && y != 0
 * 4. handle x != 0 && y == 0, x>0: 0, x<0: PI, tpu atan default all pi/2
 * 5. set y == 0 && x < 0 to PI
 *
 * NOTE(review): this comment used to describe step 5 as
 * "x = 0 && y = 0 => PI", which does not match the implementation's own
 * step 5 comment - confirm the intended result for x = 0 && y = 0.
 *
 * The result is in radians: 1.0 is passed as the degree factor.
 */
void cvm_atan2_merge_emit(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf,
                          cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_y0_buf,
                          cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table,
                          cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_mantissa,
                          cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) {
  // Plain call: ISO C does not allow `return <expression>;` in a function
  // returning void, even when the expression itself has type void.
  _cvm_atan2_merge_emit(ctx, y, x, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, tl_invert_buf,
                        tl_pos_neg_table, tl_table_answer, tl_table_answer_mantissa,
                        tl_ofmap_bf16, fmt, 1.0);
}
/**
 * Degree flavour of cvm_atan2_merge_emit: forwards every argument to
 * _cvm_atan2_merge_emit and passes 180 / M_PI as the degree factor.
 */
void cvm_atan2_fast_degree_emit(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf,
                                cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_y0_buf,
                                cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table,
                                cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_mantissa,
                                cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) {
  // Plain call: ISO C does not allow `return <expression>;` in a function
  // returning void, even when the expression itself has type void.
  _cvm_atan2_merge_emit(ctx, y, x, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, tl_invert_buf,
                        tl_pos_neg_table, tl_table_answer, tl_table_answer_mantissa,
                        tl_ofmap_bf16, fmt, 180 / M_PI);
}

View File

@ -0,0 +1,149 @@
/**
*/
#include <cvimath_internal.h>
#include "gen_lut.h" // NOLINT
//#define DBG
/*
 * Emit the reciprocal of \tl_ifmap. This is a thin wrapper: all arguments are
 * forwarded to cvm_lut_exp_mantissa and its status code is returned as-is.
 *
 * NOTICE: it could occupy 2 lookup table size which shape is <1,32,32,8> with bf16 data type
 *
 * \tl_buf tmp buffer, the shape MUST be same with \tl_ifmap
 * \tl_ofmap_bf16 result as bf16, MUST be given because it is also used as tmp buffer
 */
int cvm_emit_reciprocal(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* IN tl_buf,
                        cvk_tl_t* tbl_answer, cvk_tl_t* tbl_answer_mantissa,
                        cvk_tl_t* OUT tl_ofmap_bf16) {
  const int status = cvm_lut_exp_mantissa(ctx, tl_ifmap, tl_buf, tbl_answer, tbl_answer_mantissa,
                                          tl_ofmap_bf16);
  return status;
}
// <! gen reciprocal f(x) = 1/x
// Returns base^(-p); trips ASSERT(0) when the result is NaN.
static double _gen_reciprocal(int base, int p) {
  // y = x ^ -1
  const int negated_exp = -1 * p;
  const double reciprocal = (double)(pow(base, negated_exp));

  if (isnan(reciprocal)) {
    ASSERT(0);
  }

  return reciprocal;
}
/**
 * Fill the reciprocal lookup table.
 *
 * Entries written for channel 0, in order:
 *   - 1 entry 0x7F80 (stands in for 0^-1)
 *   - (half - 1) entries  2^(-e) for the positive side
 *   - 1 entry 0x7F80
 *   - (half - 1) entries -((-2)^(-e)) for the negative side
 * where e walks exp_start, exp_start + 1, ... and every odd value is lowered
 * by one so that e is always even. Channel 0 is then copied to channels
 * 1 .. table_shape->c - 1.
 *
 * NOTE(review): 0x7F80 is the bf16 bit pattern of +infinity, while the
 * original comments talk about 0x7F7F (largest finite value) - confirm which
 * one is intended.
 *
 * \param table_data  [out] table storage, table_shape->c * cvm_table_hw() entries
 * \param table_shape table shape, must satisfy is_1880v2_tbl_shape()
 */
void cvm_gen_reciprocal(uint16_t* table_data, cvk_tl_shape_t* table_shape) {
  ASSERT(is_1880v2_tbl_shape(table_shape));

  int exp_start = cvm_exp_start();
  int half = half_h_table();
  int table_hw = cvm_table_hw();
  uint64_t idx = 0;

  // prepare channel 0
  // 0^-1 is invalid, use positive/negtive max value: 0x7F7F / 0xFF7F
  // table_data[idx] = 0xff7f; //<! convert to 0xff7f, mulitply slope[0](0.5) is feff
  table_data[idx] = 0x7F80;  //<! convert to 0x7F7F
#ifdef DBG
  // idx is uint64_t; cast so the argument matches %lu on every ABI
  printf("t [%lu] is %f bf %x\n", (unsigned long)idx, convert_bf16_fp32(table_data[idx]),
         table_data[idx]);
#endif
  idx++;

  // > 0, exp from 0 -62 -61 .. 62 63
  for (int i = 0; i < half - 1; i++) {
    int shift = (exp_start + i);
    bool is_odd = (shift % 2);
    float exponent = shift;
    if (is_odd) {
      exponent = exponent - 1;
    }

    double value = _gen_reciprocal(2, exponent);
    table_data[idx] = convert_fp32_bf16(value);
#ifdef DBG
    printf("t [%lu] is %f [idx:%f][2^%f] bf %x\n", (unsigned long)idx,
           convert_bf16_fp32(table_data[idx]), (float)(exp_start + i), -1 * exponent,
           table_data[idx]);
#endif
    idx++;
  }

  // separator entry between the positive and the negative half
  // (the previous code first stored 2^0 here and immediately overwrote it;
  // that dead store has been dropped)
  table_data[idx] = 0x7F80;  //<! convert to 0x7F7F
#ifdef DBG
  printf("t [%lu] is %f[%d] bf %x\n", (unsigned long)idx, convert_bf16_fp32(table_data[idx]), 0,
         table_data[idx]);
#endif
  idx++;

  // < 0, exp from 0 -62 -61 .. 62 63
  for (int i = 0; i < half - 1; i++) {
    int shift = (exp_start + i);
    bool is_odd = (shift % 2);
    float exponent = shift;
    if (is_odd) {
      exponent = exponent - 1;
    }

    double value = -1 * _gen_reciprocal(-2, exponent);
    table_data[idx] = convert_fp32_bf16(value);
#ifdef DBG
    printf("t [%lu] is %f(%e - %.8lf)[(-2)^%f] bf %x\n", (unsigned long)idx,
           convert_bf16_fp32(table_data[idx]), value, value, exponent, table_data[idx]);
#endif
    idx++;
  }

  // idx = 255 dont care
  // s = _gen_reciprocal(2, 0);
  // table_data[idx] = convert_fp32_bf16(s);
  // printf("t [%lu] is %f[%d]\n", idx, convert_bf16_fp32(table_data[idx]), 0);
  // idx++;

  // duplicate channel #1 to #31
  // TODO: tensor copy
  for (uint32_t i = 1; i < table_shape->c; i++) {
    memcpy(&table_data[i * table_hw], &table_data[0], sizeof(uint16_t) * table_hw);
  }
}
/**
 * \brief Fill the mantissa lookup table used by the reciprocal operator.
 *
 * For i in [0, half): entry [128 + i] holds (1 + i/128)^-1 and entry [i] holds
 * (2 * (1 + i/128))^-1, both stored as bf16. Channel 0 is then copied to every other channel.
 */
void cvm_gen_reciprocal_mantissa(uint16_t* OUT table_mantissa, cvk_tl_shape_t* table_shape) {
  ASSERT(is_1880v2_tbl_shape(table_shape));
  const uint32_t half = half_h_table();
  const int table_hw = cvm_table_hw();

  for (uint32_t step = 0; step < half; step++) {
    const double mantissa = 1 + step * 1 / 128.0;

    table_mantissa[128 + step] = convert_fp32_bf16(pow(mantissa, -1));

    // 13=2^3x1.625=(2^2)x(2^1x1.625)
    table_mantissa[step] = convert_fp32_bf16(pow(2 * mantissa, -1));
  }

#ifdef DBG
  for (uint32_t i = 0; i < 2 * half; i++) {
    printf("mantissa [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_mantissa[i]),
           table_mantissa[i]);
  }
#endif /* ifdef DBG */

  // duplicate channel #1 to #31
  // TODO: tensor copy
  for (uint64_t ch = 1; ch < table_shape->c; ch++) {
    memcpy(table_mantissa + table_hw * ch, table_mantissa, sizeof(uint16_t) * table_hw);
  }
}
/**
 * \brief Generate both reciprocal lookup tables in one call.
 *
 * \table_data      receives the exponent table (cvm_gen_reciprocal)
 * \table_mantissa  receives the mantissa table (cvm_gen_reciprocal_mantissa)
 * \table_shape     shape shared by both tables
 */
void cvm_reciprocal_tbl(uint16_t* table_data, uint16_t* table_mantissa,
                        cvk_tl_shape_t* table_shape) {
  // all three pointers are mandatory
  ASSERT(table_data);
  ASSERT(table_mantissa);
  ASSERT(table_shape);

  cvm_gen_reciprocal(table_data, table_shape);
  cvm_gen_reciprocal_mantissa(table_mantissa, table_shape);
}

387
cvimath/src/tiu_reshape_c.c Normal file
View File

@ -0,0 +1,387 @@
/**
* reshape channel under depthwise
*/
//
#include <cvimath_internal.h>
#include "gen_lut.h" // NOLINT
//#define DBG
// copy from \1880v2_test_util.h
// Computes (h - 1) * (ins_h + 1) + ins_h_l + 1 + pad_h_t + pad_h_b.
static int calc_dilute_hw(int h, int ins_h, int ins_h_l, int pad_h_b, int pad_h_t) {
  const int spread = (h - 1) * (ins_h + 1);
  const int tail = ins_h_l + 1;
  return spread + tail + pad_h_t + pad_h_b;
}
// get padding as 'SAME' mode in tensorflow
// https://www.jianshu.com/p/05c4f1621c7e
// Computes (ceil(ih / sh) - 1) * sh + kh - ih; the result is not clamped.
static int get_same_pad(int ih, int sh, int kh) {
  const int out_len = (ih + sh - 1) / sh;  // ceil(ih / sh) for positive operands
  const int covered = (out_len - 1) * sh + kh;
  return covered - ih;
}
// get real 'h' with pad/ins
// Computes (ih - 1) * (ins_h + 1) + ins_last_h + 1 + pad_top + pad_bottom.
static int pooling_ih_ext(int ins_h, int ins_last_h, int pad_top, int pad_bottom, int ih) {
  const int step = ins_h + 1;
  const int total_pad = pad_top + pad_bottom;
  return (ih - 1) * step + ins_last_h + 1 + total_pad;
}
// get real 'w' with pad/ins
// Computes (iw - 1) * (ins_w + 1) + ins_last_w + 1 + pad_left + pad_right.
static int pooling_iw_ext(int ins_w, int ins_last_w, int pad_left, int pad_right, int iw) {
  const int step = ins_w + 1;
  const int total_pad = pad_left + pad_right;
  return (iw - 1) * step + ins_last_w + 1 + total_pad;
}
// get output h with parameter
// Returns (ih_ext - ((kh - 1) * dh + 1)) / stride_h + 1, where ih_ext comes from
// pooling_ih_ext().
static int pooling_oh(int ins_h, int ins_last_h, int pad_top, int pad_bottom, int stride_h, int ih,
                      int kh, int dh) {
  const int kernel_extent = (kh - 1) * dh + 1;
  const int input_extent = pooling_ih_ext(ins_h, ins_last_h, pad_top, pad_bottom, ih);
  return (input_extent - kernel_extent) / stride_h + 1;
}
// get output w with parameter
// Returns (iw_ext - ((kw - 1) * dw + 1)) / stride_w + 1, where iw_ext comes from
// pooling_iw_ext().
static int pooling_ow(int ins_w, int ins_last_w, int pad_left, int pad_right, int stride_w, int iw,
                      int kw, int dw) {
  const int kernel_extent = (kw - 1) * dw + 1;
  const int input_extent = pooling_iw_ext(ins_w, ins_last_w, pad_left, pad_right, iw);
  return (input_extent - kernel_extent) / stride_w + 1;
}
/**
 * \brief get extended bias
 *
 * Each of the \old_bias_c source channels (hi * wi elements of `sz` bytes, sz = 4 for
 * CVK_FMT_BF16 and 2 otherwise) is repeated ci / old_bias_c times in the output.
 *
 * \return newly malloc'ed bias that the caller must free(), or NULL if the allocation failed
 *
 * NOTE(review): the buffer is sized with \ni but only ci * hi * wi * sz bytes are written, so
 * any remaining bytes are left uninitialized - confirm whether that is intended.
 */
uint32_t* cvm_reshape_channel_bias(uint8_t* bias, int ni, int ci, int hi, int wi, int old_bias_c,
                                   cvk_fmt_t fmt) {
  ASSERT(bias);
  ASSERT((ni == 2 || ni == 1) && "not support bias batch > 1");
  ASSERT(ci / old_bias_c > 0 && ci % old_bias_c == 0);

  int sz = fmt == CVK_FMT_BF16 ? 4 : 2;
  int d_c_bias_sz = ni * ci * hi * wi;
  uint8_t* new_bias = (uint8_t*)malloc((size_t)d_c_bias_sz * sz);
  if (!new_bias) {
    // out of memory: report it instead of writing through a NULL pointer below
    return NULL;
  }

  int bias_hw = hi * wi;
  int duplicat_c = ci / old_bias_c;
  for (int c = 0; c < old_bias_c; c++) {
    // byte offset of source channel c
    int shift = (c * bias_hw) * sz;
    for (int i = 0; i < duplicat_c; i++) {
      // byte offset of the i-th copy of channel c in the output
      int new_bias_shift = (c * duplicat_c + i) * bias_hw * sz;
      memcpy(&new_bias[new_bias_shift], &bias[shift], bias_hw * sz);
    }
  }
  return (uint32_t*)new_bias;
}
/*
 * \brief prepare load shape/stride
 *
 * Searches, from npu_num / ic downwards, for a split count that divides \ih such that the
 * resulting height is a multiple of \stride_h and at least \stride_h. On success the tl/tg
 * shapes become <1, ic * ih / oh, oh, iw> with their default strides.
 *
 * \return -1 means fail to reshape, 0 means success
 * \TODO check memory usage
 */
static inline int _get_dup_shape(cvk_context_t* ctx, int in, int ic, int ih, int iw, int d_kh,
                                 int stride_h, int npu_num, cvk_tl_shape_t* tl_shape,
                                 cvk_tl_stride_t* tl_load_stride, cvk_tg_shape_t* tg_shape,
                                 cvk_tg_stride_t* tg_stride, cvk_fmt_t src_tg_fmt,
                                 cvk_fmt_t dst_tl_fmt) {
  ASSERT(in > 0 && ic > 0 && ih > 0 && iw > 0 && d_kh > 0 && stride_h > 0);
  ASSERT(tl_shape && tl_load_stride && tg_shape && tg_stride);

  // 1. reshape and extend c, h axis in order
  const int total_ch = ic * ih;

  // FIXME: check kernel setting
  // (a variant of this search that split on h * w instead of h was compiled out with #if 0)
  int oh = 0;
  for (int split = npu_num / ic; split > 0; split--) {
    const int candidate = ih / split;
    if (ih % split != 0) {
      continue;
    }
    if (candidate % stride_h == 0 && candidate >= stride_h /*&& candidate >= d_kh*/) {
      oh = candidate;
      break;
    }
  }
  if (oh == 0) {
    // FIXME: check terminal condition
    return -1;
  }
  const int oc = total_ch / oh;
#ifdef DBG
  printf("ic:ih is %d %d, oc:oh is %d:%d\n", ic, ih, oc, oh);
#endif

  // tg/tl MUST be same shape size
  tg_shape->n = 1;
  tl_shape->n = tg_shape->n;
  tg_shape->c = oc;
  tl_shape->c = tg_shape->c;
  tg_shape->h = oh;
  tl_shape->h = tg_shape->h;
  tg_shape->w = iw;
  tl_shape->w = tg_shape->w;

  // init tl
  const cvk_tl_stride_t local =
      ctx->ops->tl_default_stride(ctx, *tl_shape, dst_tl_fmt, CTRL_NULL);
  tl_load_stride->n = local.n;
  tl_load_stride->c = local.c;
  tl_load_stride->h = local.h;
  tl_load_stride->w = local.w;

  // init tg
  const cvk_tg_stride_t global = ctx->ops->tg_default_stride(ctx, *tg_shape, src_tg_fmt);
  tg_stride->n = global.n;
  tg_stride->c = global.c;
  tg_stride->h = global.h;
  return 0;
}
/**
 * \brief get proper reshape size for depthwise conv with 'same' mode in h direction
 *
 * Uses _get_dup_shape() to pick the reshaped ifmap, then derives the weight, bias and ofmap
 * shapes from it and grows the ifmap height by the 'same' padding.
 *
 * \return -1 means alloc fail
 * \NOTICE: not support batch/ins_x/dilated_x/pad_top/pad_bottom
 */
int cvm_reshape_channel_same(cvk_context_t* ctx, int ic, int ih, int iw, int kh, int kw,
                             int pad_right, int pad_left, int stride_h, int stride_w,
                             cvk_tl_shape_t* tl_load_shape, cvk_tl_stride_t* new_tl_ifmap_stride,
                             cvk_tg_shape_t* new_tg_ifmap_shape,
                             cvk_tg_stride_t* new_tg_ifmap_stride,
                             cvk_tl_shape_t* new_tl_weight_shape, cvk_tl_shape_t* new_tl_bias_shape,
                             cvk_tl_shape_t* new_tl_ofmap_shape, cvk_fmt_t fmt, int eu_align) {
  ASSERT(eu_align == 0 || eu_align == 1);

  // TODO: verify dilation_h/dilation_w
  const int dilation_h = 1;
  const int dilation_w = 1;
  const int in = 1;

  // TODO: verify p->ins_h, p->ins_last_h
  const int d_kh = calc_dilute_hw(kh, dilation_h - 1, 0, 0, 0);
  const int h_after = calc_dilute_hw(ih, 0, 0, 0, 0);

  const int ret =
      _get_dup_shape(ctx, in, ic, h_after, iw, d_kh, stride_h, ctx->info.npu_num, tl_load_shape,
                     new_tl_ifmap_stride, new_tg_ifmap_shape, new_tg_ifmap_stride, fmt, fmt);
  if (ret == -1) {
    return ret;
  }

  // weight keeps the kernel size, one kernel per reshaped channel
  new_tl_weight_shape->n = 1;
  new_tl_weight_shape->c = tl_load_shape->c;
  new_tl_weight_shape->h = kh;
  new_tl_weight_shape->w = kw;

  new_tl_bias_shape->n = 2;
  new_tl_bias_shape->c = tl_load_shape->c;
  new_tl_bias_shape->h = 1;
  new_tl_bias_shape->w = 1;

  // reserve for padding
  const int pad_h = get_same_pad(tl_load_shape->h, stride_h, kh);
  new_tg_ifmap_shape->h += pad_h;
  tl_load_shape->h += pad_h;

  // the local stride has to be recomputed for the padded height
  const cvk_tl_stride_t padded = ctx->ops->tl_default_stride(ctx, *tl_load_shape, fmt, eu_align);
  new_tl_ifmap_stride->n = padded.n;
  new_tl_ifmap_stride->c = padded.c;
  new_tl_ifmap_stride->h = padded.h;
  new_tl_ifmap_stride->w = padded.w;

  // TODO: verity ins_x
  const int oh = pooling_oh(0, 0, 0, 0, stride_h, tl_load_shape->h, kh, dilation_h);
  const int ow =
      pooling_ow(0, 0, pad_left, pad_right, stride_w, tl_load_shape->w, kw, dilation_w);
#ifdef DBG
  printf("new oh/ow pad_h is %d/%d %d\n", oh, ow, pad_h);
#endif

  new_tl_ofmap_shape->n = in;
  new_tl_ofmap_shape->c = tl_load_shape->c;
  new_tl_ofmap_shape->h = oh;
  new_tl_ofmap_shape->w = ow;
  return ret;
}
/*
 * \brief duplicate weight for reshaped c
 *
 * For every n, each of the \old_weight_c source channels (hi * wi elements of `sz` bytes,
 * sz = 2 for CVK_FMT_BF16 and 1 otherwise) is repeated ci / old_weight_c times.
 *
 * \return newly malloc'ed weight that the caller must free(), or NULL if the allocation failed
 */
uint8_t* cvm_reshape_channel_weight(uint8_t* weight, int ni, int ci, int hi, int wi,
                                    int old_weight_c, cvk_fmt_t fmt) {
  ASSERT(weight);
  ASSERT(ci / old_weight_c > 0 && ci % old_weight_c == 0);

  int sz = fmt == CVK_FMT_BF16 ? 2 : 1;
  int new_weight_hw_shape_size = hi * wi;
  int new_weight_shape_size = ni * ci * hi * wi;
  int duplicat_c = ci / old_weight_c;

  uint8_t* new_weight = (uint8_t*)malloc((size_t)new_weight_shape_size * sz);
  if (!new_weight) {
    // out of memory: report it instead of writing through a NULL pointer below
    return NULL;
  }

  for (int n = 0; n < ni; n++) {
    for (int c = 0; c < old_weight_c; c++) {
      // byte offset of source plane (n, c)
      int index = (n * old_weight_c + c) * new_weight_hw_shape_size * sz;
      for (int i = 0; i < duplicat_c; i++) {
        // byte offset of the i-th copy of that plane in the output
        int new_weight_index =
            (n * old_weight_c * duplicat_c + c * duplicat_c + i) * new_weight_hw_shape_size * sz;
        memcpy(&new_weight[new_weight_index], &weight[index], new_weight_hw_shape_size * sz);
      }
    }
  }
  return new_weight;
}
/*
 * \brief prepare load shape/stride with pad
 *
 * Computes the total output height for an already padded \ih, then searches from
 * npu_num / ic downwards for a split count that divides it. On success the tl/tg shapes
 * become <1, split * ic, stride_h * (out_h / split - 1) + 1, iw> with their default strides.
 *
 * \return -1 means fail to reshape, 0 means success
 * \TODO check memory usage
 */
static inline int _get_dup_shape_same_pad(cvk_context_t* ctx, int in, int ic, int ih, int iw,
                                          int d_kh, int stride_h, int npu_num,
                                          cvk_tl_shape_t* tl_load_shape,
                                          cvk_tl_stride_t* tl_load_stride, cvk_tg_shape_t* tg_shape,
                                          cvk_tg_stride_t* tg_stride, cvk_fmt_t src_tg_fmt,
                                          cvk_fmt_t dst_tl_fmt) {
  ASSERT(in > 0 && ic > 0 && ih > 0 && iw > 0 && d_kh > 0 && stride_h > 0);
  ASSERT(tl_load_shape && tl_load_stride && tg_shape && tg_stride);

  // 1. reshape and extend c, h axis in order
  // FIXME: check kernel setting
  int oc = 0;
  int oh = 0;

  // 2. get total output
  // 3. slice output
  ASSERT((ih - d_kh) % stride_h == 0);
  const int padded_ih = pooling_ih_ext(0, 0, 0, 0, ih);
  const int total_oh = (padded_ih - d_kh) / stride_h + 1;

  for (int split = npu_num / ic; split > 0; split--) {
    if (total_oh % split != 0) {
      continue;
    }
    // add 1 for later padding
    oh = stride_h * (total_oh / split - 1) + 1;
    oc = split * ic;
    break;
  }
  if (oh == 0) {
    // FIXME: check terminal condition
    return -1;
  }
#ifdef DBG
  printf("ic:ih is %d %d, oc:oh is %d:%d\n", ic, ih, oc, oh);
#endif

  // tg/tl MUST be same shape size
  tg_shape->n = 1;
  tl_load_shape->n = tg_shape->n;
  tg_shape->c = oc;
  tl_load_shape->c = tg_shape->c;
  tg_shape->h = oh;
  tl_load_shape->h = tg_shape->h;
  tg_shape->w = iw;
  tl_load_shape->w = tg_shape->w;

  // init tl
  const cvk_tl_stride_t local =
      ctx->ops->tl_default_stride(ctx, *tl_load_shape, dst_tl_fmt, CTRL_NULL);
  tl_load_stride->n = local.n;
  tl_load_stride->c = local.c;
  tl_load_stride->h = local.h;
  tl_load_stride->w = local.w;

  // init tg
  const cvk_tg_stride_t global = ctx->ops->tg_default_stride(ctx, *tg_shape, src_tg_fmt);
  tg_stride->n = global.n;
  tg_stride->c = global.c;
  tg_stride->h = global.h;
  return 0;
}
/**
 * \brief get proper reshape size for depthwise conv with 'same' mode in h direction
 * 'pad' means \ih is padded
 *
 * Uses _get_dup_shape_same_pad() to pick the reshaped ifmap, then derives the weight, bias
 * and ofmap shapes from it and grows the ifmap height by the 'same' padding.
 *
 * \return -1 means alloc fail
 * \NOTICE: not support batch/ins_x/dilated_x/pad_top/pad_bottom
 */
int cvm_reshape_channel_same_pad(
    cvk_context_t* ctx, int ic, int ih, int iw, int kh, int kw, int pad_right, int pad_left,
    int stride_h, int stride_w, cvk_tl_shape_t* tl_load_shape, cvk_tl_stride_t* new_tl_ifmap_stride,
    cvk_tg_shape_t* new_tg_ifmap_shape, cvk_tg_stride_t* new_tg_ifmap_stride,
    cvk_tl_shape_t* new_tl_weight_shape, cvk_tl_shape_t* new_tl_bias_shape,
    cvk_tl_shape_t* new_tl_ofmap_shape, cvk_fmt_t fmt, int eu_align) {
  ASSERT(eu_align == 0 || eu_align == 1);

  // TODO: verify dilation_h/dilation_w
  const int dilation_h = 1;
  const int dilation_w = 1;
  const int in = 1;

  // TODO: verify p->ins_h, p->ins_last_h
  const int d_kh = calc_dilute_hw(kh, dilation_h - 1, 0, 0, 0);
  const int h_after = calc_dilute_hw(ih, 0, 0, 0, 0);

  const int ret = _get_dup_shape_same_pad(ctx, in, ic, h_after, iw, d_kh, stride_h,
                                          ctx->info.npu_num, tl_load_shape, new_tl_ifmap_stride,
                                          new_tg_ifmap_shape, new_tg_ifmap_stride, fmt, fmt);
  if (ret == -1) {
    return ret;
  }

  // weight keeps the kernel size, one kernel per reshaped channel
  new_tl_weight_shape->n = 1;
  new_tl_weight_shape->c = tl_load_shape->c;
  new_tl_weight_shape->h = kh;
  new_tl_weight_shape->w = kw;

  new_tl_bias_shape->n = 2;
  new_tl_bias_shape->c = tl_load_shape->c;
  new_tl_bias_shape->h = 1;
  new_tl_bias_shape->w = 1;

  // reserve for padding
  const int pad_h = get_same_pad(tl_load_shape->h, stride_h, kh);
  new_tg_ifmap_shape->h += pad_h;
  tl_load_shape->h += pad_h;

  // the local stride has to be recomputed for the padded height
  const cvk_tl_stride_t padded = ctx->ops->tl_default_stride(ctx, *tl_load_shape, fmt, eu_align);
  new_tl_ifmap_stride->n = padded.n;
  new_tl_ifmap_stride->c = padded.c;
  new_tl_ifmap_stride->h = padded.h;
  new_tl_ifmap_stride->w = padded.w;

  // TODO: verity ins_x
  const int oh = pooling_oh(0, 0, 0, 0, stride_h, tl_load_shape->h, kh, dilation_h);
  const int ow =
      pooling_ow(0, 0, pad_left, pad_right, stride_w, tl_load_shape->w, kw, dilation_w);
#ifdef DBG
  printf("new oh/ow pad_h is %d/%d %d\n", oh, ow, pad_h);
#endif

  new_tl_ofmap_shape->n = in;
  new_tl_ofmap_shape->c = tl_load_shape->c;
  new_tl_ofmap_shape->h = oh;
  new_tl_ofmap_shape->w = ow;
  return ret;
}

266
cvimath/src/tiu_sigmoid.c Normal file
View File

@ -0,0 +1,266 @@
/**
* implement Linear interpolation search
*
* we need to pass 2 tables: one holds the answers (lut_answer), the other holds the slopes
* between adjacent answers (lut_answer_slope),
*
* for example, we want to get x value
* +------+----+
* x0 x x1
*
* [linear interpolation](https://en.wikipedia.org/wiki/Linear_interpolation) is defined as
* follows:
*
* part C part A part B
* +--+ +---+ +----------------------------------------+
*
* p(x) = f(x0) + ( (f(x1) - f(x0)) / (x1 - x0) ) * (x - x0)
*
* +---+ +-----------------------------+
* lut_answer lut_answer_slope
*/
#include <cvimath_internal.h>
#include "gen_lut.h" // NOLINT
//#define DBG
/*
 * Emit the TIU/TDMA command sequence that evaluates p(x) = f(x0) + slope(x0) * (x - x0)
 * through two table lookups.
 *
 * NOTICE: it could occupy 2 lookup table size which shape is <1,32,32,8> with bf16 data type
 *
 * \tl_ifmap              input; it is overwritten (scaled, then turned into x - x0)
 * \tl_buf                tmp buffer, the shape MUST be same with \tl_ifmap
 * \tl_table_answer       table providing f(x0)
 * \tl_table_answer_slope table providing the slope at x0
 * \tl_ofmap_bf16         result as bf16, MUST given for tmp buffer used
 * \scale                 factor applied to the input before it is converted to a table index
 * \return always 0
 */
int cvm_emit_sigmoid(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* IN tl_buf,
                     cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_slope,
                     cvk_tl_t* OUT tl_ofmap_bf16, float scale) {
  cvm_table_check(tl_ifmap, tl_table_answer, tl_table_answer_slope, tl_buf);

  cvk_fmt_t fmt = CVK_FMT_BF16;
  cvk_tl_shape_t tl_ofmap_A_idx_int8_shape = {1, tl_buf->shape.c, tl_buf->shape.h * tl_buf->shape.w,
                                              1};
  cvk_tdma_l2l_tensor_copy_param_t p10;

  // scale input for remap its idx(-x~x) to (-127~127), dirty tl_ifmap
  cvk_tiu_mul_param_t p1;
  // zero first so that fields not assigned below never carry stack garbage into the command
  memset(&p1, 0x00, sizeof(cvk_tiu_mul_param_t));
  p1.res_high = NULL;
  p1.res_low = tl_ifmap;
  p1.a = tl_ifmap;
  p1.b_is_const = 1;
  p1.b_const.val = convert_fp32_bf16(scale);
  p1.rshift_bits = 0;
  p1.relu_enable = 0;
  ctx->ops->tiu_mul(ctx, &p1);

  // <! get idx from bf16->int8
  // save by stride
  memset(&p10, 0x00, sizeof(cvk_tdma_l2l_tensor_copy_param_t));
  cvk_tl_t dst;
  memcpy(&dst, tl_ofmap_bf16, sizeof(cvk_tl_t));
  dst.fmt = CVK_FMT_I8;
  dst.shape = tl_ofmap_A_idx_int8_shape;
  dst.stride = ctx->ops->tl_default_stride(ctx, dst.shape, dst.fmt, CTRL_NULL);
  // double the h stride of the int8 view
  // NOTE(review): presumably so each int8 index lands on a bf16-sized slot - confirm
  dst.stride.h = dst.stride.h * 2;
  dst.int8_rnd_mode = 1;
  p10.dst = &dst;
  p10.src = tl_ifmap;
  ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10);
  dst.int8_rnd_mode = 0;  // reset

  // <! int8 back to bf16 because sub MUST operate on operands of the same format
  memset(&p10, 0x00, sizeof(cvk_tdma_l2l_tensor_copy_param_t));
  p10.dst = tl_buf;  //<! bf16
  p10.src = &dst;
  ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10);

  // <! sub, diff base , a - b
  // (x - x0)
  cvk_tiu_sub_param_t p5;
  memset(&p5, 0x00, sizeof(cvk_tiu_sub_param_t));
  p5.res_high = 0;
  p5.res_low = tl_ifmap;
  p5.a_high = 0;
  p5.a_low = tl_ifmap;
  p5.b_high = 0;
  p5.b_low = tl_buf;
  p5.rshift_bits = 0;
  ctx->ops->tiu_sub(ctx, &p5);

  // get f(x0) and slope(x)
  // reshape, 16->16
  dst.fmt = fmt;
  dst.shape = tl_buf->shape;
  dst.stride = tl_buf->stride;

  // <! get slope by index
  // <! ( (f(x1) - f(x0)) / (x1 - x0) )
  // <! TIU MUST with same shape and stride, we leverage output map shape and stride
  cvk_tiu_lookup_table_param_t p12;
  memset(&p12, 0x0, sizeof(cvk_tiu_lookup_table_param_t));
  p12.ofmap = tl_buf;
  p12.ifmap = &dst;
  p12.table = tl_table_answer_slope;
  ctx->ops->tiu_lookup_table(ctx, &p12);

  // base f(x0)
  memset(&p12, 0x0, sizeof(cvk_tiu_lookup_table_param_t));
  p12.ofmap = tl_ofmap_bf16;
  p12.ifmap = &dst;
  p12.table = tl_table_answer;
  ctx->ops->tiu_lookup_table(ctx, &p12);

  // <! mac
  // <! part A + part B, a * b + res = res
  cvk_tiu_mac_param_t p2;
  memset(&p2, 0x00, sizeof(cvk_tiu_mac_param_t));
  p2.res_high = 0;
  p2.res_low = tl_ofmap_bf16;
  p2.res_is_int8 = 0;
  p2.a = tl_ifmap;
  p2.b_is_const = 0;
  p2.b = tl_buf;
  p2.lshift_bits = 0;  // lshift_bits;
  p2.rshift_bits = 0;  // rshift_bits;
  p2.relu_enable = 0;
  ctx->ops->tiu_mac(ctx, &p2);

  return 0;
}
static double _gen_sigmoid(float x) { return 1.0 / (1.0 + exp(-(x))); }
/**
 * Allocates an uninitialized scratch array of cvm_table_hw() doubles.
 * Returns whatever malloc() returns (NULL on failure); release it with
 * cvm_free_sigmoid_double().
 */
double* cvm_gen_sigmoid_double() {
  const int entries = cvm_table_hw();
  double* scratch = (double*)malloc(sizeof(double) * entries);
  return scratch;
}
// Releases a buffer obtained from cvm_gen_sigmoid_double(); forwards directly to free().
void cvm_free_sigmoid_double(double* sigmode_hw) {
  free(sigmode_hw);
}
/**
 * \brief Fill the sigmoid answer table.
 *
 * Channel 0 is generated and then copied to every other channel of \table_shape:
 *   - entries [0, half)          : sigmoid(idx / scale)
 *   - entry   [half]             : sigmoid(range_start)
 *   - entries (half, 2 * half)   : sigmoid(range_start + i / scale), i = 1 .. half - 1
 *
 * \table_data  receives the bf16 samples
 * \sigmode_hw  receives the same samples in double precision (one per table entry)
 * \scale       number of table entries per input unit
 * \range_start input value mapped to entry [half]
 *
 * NOTE(review): the debug prints below are guarded by GDB, not by the DBG switch mentioned at
 * the top of this file - confirm which one is intended.
 */
void cvm_gen_sigmoid(uint16_t* table_data, cvk_tl_shape_t* table_shape, double* sigmode_hw,
float scale, int range_start) {
// S(x) = 1 / (1 + (e^-x))
//<! 32*8 table, duplicate `channel` times;
uint64_t idx = 0;
ASSERT(is_1880v2_tbl_shape(table_shape));
int half = half_h_table();
int table_hw = cvm_table_hw();
// prepare channel 0
// x [0, 127]
// we re-scale [-8, 8] into 256
for (int i = 0; i < half; i++) {
// table index -> input value (float precision)
float _idx = idx / scale;
double s = _gen_sigmoid(_idx);
sigmode_hw[idx] = s;
table_data[idx] = convert_fp32_bf16((float)s);
#ifdef GDB
printf("t [%lu] is %f[%d], 0x%x fp is %f d is %.8lf, input is %f\n", idx,
convert_bf16_fp32(table_data[idx]), i, table_data[idx], (float)s, s, _idx);
#endif
idx++;
}
// x = -128
double s = _gen_sigmoid(range_start);
sigmode_hw[idx] = s;
table_data[idx] = convert_fp32_bf16((double)s);
#ifdef GDB
printf("t [%lu] is %f[%d], 0x%x fp is %f d is %.8lf input is %d\n", idx,
convert_bf16_fp32(table_data[idx]), -128, table_data[idx], (float)s, s, range_start);
#endif
idx++;
// x [-128~-1], 2's complement
for (int i = 1; i < half; i++) {
// offset from range_start, in input units
float _idx = (i) / scale;
double s = _gen_sigmoid(range_start + _idx);
sigmode_hw[idx] = s;
table_data[idx] = convert_fp32_bf16((double)s);
#ifdef GDB
printf("t [%lu] is %f[%d], 0x%x fp is %f d is %.8lf input is %f\n", idx,
convert_bf16_fp32(table_data[idx]), -127 + i, table_data[idx], (float)s, s,
range_start + _idx);
#endif
idx++;
}
// duplicate channel #1 to #31
// TODO: tensor copy
for (uint32_t i = 1; i < table_shape->c; i++) {
memcpy(&table_data[i * table_hw], &table_data[0], sizeof(uint16_t) * table_hw);
}
}
/**
 * Number of table entries per input unit: cvm_table_hw() / |range_start - range_end|.
 * The division is not guarded, so range_start must differ from range_end.
 */
float cvm_sigmoid_scale(int range_start, int range_end) {
  const int entries = cvm_table_hw();
  const int span = abs(range_start - range_end);
  const double per_unit = entries / (1.0 * span);  // 256 / 16 = 16
  return per_unit;
}
/**
 * \brief Fill the sigmoid slope table from the double-precision samples in \sigmode_hw.
 *
 * slope[i] = (x1 - x0) / delta with x0 = sigmode_hw[i] and
 *   - i <  half - 1 : x1 = sigmode_hw[i + 1],                  delta =  1
 *   - i == half - 1 : x1 = sigmoid(range_end),                 delta =  1
 *   - i == half     : x1 = sigmoid(range_start - 1 / scale),   delta = -1
 *   - i >  half     : x1 = sigmode_hw[i - 1],                  delta = -1
 * Channel 0 is then copied to every other channel of \table_shape.
 */
void cvm_gen_sigmoid_slope(uint16_t* OUT table_slope, cvk_tl_shape_t* table_shape,
                           double* sigmode_hw, float scale, int range_start, int range_end) {
  ASSERT(is_1880v2_tbl_shape(table_shape));
  const int half = half_h_table();
  const int table_hw = cvm_table_hw();

  for (int i = 0; i < table_hw; i++) {
    const double x0 = sigmode_hw[i];
    double x1;
    double delta;

    if (i < half - 1) {
      // reading i + 1 only here keeps the access inside the positive half
      // (for avoid -fsanitize=address check)
      x1 = sigmode_hw[i + 1];
      delta = 1.0;
    } else if (i == half - 1) {
      //<! slope[127] means f(127)~f(128)
      x1 = _gen_sigmoid(range_end);
      delta = 1.0;
    } else if (i == half) {
      // 128 index mean x1 is -129 and x0 is -128
      x1 = _gen_sigmoid(range_start - 1 / scale);
      delta = -1.0;
    } else {
      x1 = sigmode_hw[i - 1];
      delta = -1.0;
    }

    const double s = (x1 - x0) / delta;  // x1 already scale up
    table_slope[i] = convert_fp32_bf16((float)s);
#ifdef GDB
    printf("slope table [%u] = (bf16 %f double %.8lf float %f), 0x%x, %.8lf - %.8lf(%.8lf)\n", i,
           convert_bf16_fp32(table_slope[i]), s, (float)s, table_slope[i], x1, x0, x1 - x0);
#endif
  }

  // duplicate channel #1 to #31
  // TODO: tensor copy
  for (uint64_t ch = 1; ch < table_shape->c; ch++) {
    memcpy(table_slope + table_hw * ch, table_slope, sizeof(uint16_t) * table_hw);
  }
}
/**
 * \brief Generate the sigmoid answer table and its slope table for [range_start, range_end].
 *
 * \sigmoid_table_data        receives the answer table (cvm_gen_sigmoid)
 * \sigmoid_table_data_slope  receives the slope table (cvm_gen_sigmoid_slope)
 * \table_shape               shape shared by both tables
 *
 * A temporary double-precision copy of the answers is allocated for the slope computation
 * and released before returning.
 */
void cvm_sigmoid_tbl(uint16_t* sigmoid_table_data, uint16_t* sigmoid_table_data_slope,
                     cvk_tl_shape_t* table_shape, int range_start, int range_end) {
  ASSERT(sigmoid_table_data);
  ASSERT(sigmoid_table_data_slope);
  ASSERT(table_shape);

  double* sigmode_hw = cvm_gen_sigmoid_double();
  // the scratch buffer comes from malloc(); both generators write to / read from it
  ASSERT(sigmode_hw);

  float scale = cvm_sigmoid_scale(range_start, range_end);
  cvm_gen_sigmoid(sigmoid_table_data, table_shape, sigmode_hw, scale, range_start);
  cvm_gen_sigmoid_slope(sigmoid_table_data_slope, table_shape, sigmode_hw, scale, range_start,
                        range_end);
  cvm_free_sigmoid_double(sigmode_hw);
}

121
cvimath/src/tiu_sqrt.c Normal file
View File

@ -0,0 +1,121 @@
/**
*/
#include <cvimath_internal.h>
#include "gen_lut.h" // NOLINT
//#define DBG
/*
 * Square-root entry point; forwards every argument unchanged to cvm_lut_exp_mantissa().
 *
 * NOTICE: it could occupy 2 lookup table size which shape is <1,32,32,8> with bf16 data type
 *
 * \tl_buf tmp buffer, the shape MUST be same with \tl_ifmap
 * \tl_ofmap_bf16 result as bf16, MUST given for tmp buffer used
 */
int cvm_emit_sqrt(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* IN tl_buf,
                  cvk_tl_t* tbl_answer, cvk_tl_t* tbl_answer_mantissa,
                  cvk_tl_t* OUT tl_ofmap_bf16) {
  const int status = cvm_lut_exp_mantissa(ctx, tl_ifmap, tl_buf, tbl_answer, tbl_answer_mantissa,
                                          tl_ofmap_bf16);
  return status;
}
/**
 * Returns base raised to the power p / 2, i.e. sqrt(base ^ p).
 * Trips ASSERT(0) when the result is NaN.
 */
static double _gen_sqrt(int base, int p) {
  // y = x ^ 0.5
  const double half_exponent = p * 0.5;
  const double value = pow(base, half_exponent);
  if (isnan(value)) {
    ASSERT(0);
  }
  return value;
}
/**
 * \brief Fill the exponent lookup table used by the square-root operator.
 *
 * Channel 0 is generated and then copied to every other channel of \table_shape:
 *   - slot 0          : 0 (0^0.5 = 0)
 *   - next half slots : 2^(e / 2), with e = exp_start + i lowered to the even value below
 */
void cvm_gen_sqrt(uint16_t* table_data, cvk_tl_shape_t* table_shape) {
  ASSERT(is_1880v2_tbl_shape(table_shape));
  const int exp_start = cvm_exp_start();
  const int half = half_h_table();
  const int table_hw = cvm_table_hw();
  uint64_t idx = 0;

  // prepare channel 0
  const double zero_root = 0.0;
  table_data[idx] = convert_fp32_bf16(zero_root);  // 0^0.5 = 0
#ifdef DBG
  printf("t [%lu] is %f(%.8lf)[idx:%f][2^%f] bf %x\n", idx, convert_bf16_fp32(table_data[idx]),
         zero_root, (float)exp_start, (float)(exp_start / 2), table_data[idx]);
#endif
  idx++;

  // > 0, exp from 0 -62 -61 .. 62 63
  for (int step = 0; step < half; step++) {
    const int shift = exp_start + step;
    // odd exponents share the entry of the even exponent below them
    const float exp = (shift % 2) ? (float)shift - 1 : (float)shift;
    const double root = _gen_sqrt(2, exp);
    table_data[idx] = convert_fp32_bf16(root);
#ifdef DBG
    printf("t [%lu] is %f [idx:%f][2^%f(%f)] bf %x\n", idx, convert_bf16_fp32(table_data[idx]),
           (float)(shift), exp / 2, (shift) / 2.0, table_data[idx]);
#endif
    idx++;
  }
  //// idx = 127 dont care

  // duplicate channel #1 to #channel
  // TODO: tensor copy
  for (uint32_t ch = 1; ch < table_shape->c; ch++) {
    memcpy(table_data + ch * table_hw, table_data, sizeof(uint16_t) * table_hw);
  }
}
/**
 * \brief Fill the mantissa lookup table used by the square-root operator.
 *
 * For i in [0, half): entry [128 + i] holds (1 + i/128)^0.5 and entry [i] holds
 * (2 * (1 + i/128))^0.5, both stored as bf16. Channel 0 is then copied to every other channel.
 */
void cvm_gen_sqrt_mantissa(uint16_t* OUT table_mantissa, cvk_tl_shape_t* table_shape) {
  ASSERT(is_1880v2_tbl_shape(table_shape));
  const uint32_t half = half_h_table();
  const int table_hw = cvm_table_hw();

  for (uint32_t step = 0; step < half; step++) {
    const double mantissa = 1 + step * 1 / 128.0;

    table_mantissa[128 + step] = convert_fp32_bf16(pow(mantissa, 0.5));

    // 13=2^3x1.625=(2^2)x(2^1x1.625)
    table_mantissa[step] = convert_fp32_bf16(pow(2 * mantissa, 0.5));
  }

#ifdef DBG
  for (uint32_t i = 0; i < 2 * half; i++) {
    printf("mantissa [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_mantissa[i]),
           table_mantissa[i]);
  }
#endif /* ifdef DBG */

  // duplicate channel #1 to #31
  // TODO: tensor copy
  for (uint64_t ch = 1; ch < table_shape->c; ch++) {
    memcpy(table_mantissa + table_hw * ch, table_mantissa, sizeof(uint16_t) * table_hw);
  }
}
/**
 * \brief Generate both square-root lookup tables in one call.
 *
 * \sqrt_table_data           receives the exponent table (cvm_gen_sqrt)
 * \sqrt_table_data_mantissa  receives the mantissa table (cvm_gen_sqrt_mantissa)
 * \table_shape               shape shared by both tables
 */
void cvm_sqrt_tbl(uint16_t* sqrt_table_data, uint16_t* sqrt_table_data_mantissa,
                  cvk_tl_shape_t* table_shape) {
  // all three pointers are mandatory
  ASSERT(sqrt_table_data);
  ASSERT(sqrt_table_data_mantissa);
  ASSERT(table_shape);

  cvm_gen_sqrt(sqrt_table_data, table_shape);
  cvm_gen_sqrt_mantissa(sqrt_table_data_mantissa, table_shape);
}

View File

@ -0,0 +1,54 @@
#include <cvimath_internal.h>
#include "gen_lut.h"
/**
 * \brief Issue one depthwise convolution that upsamples \tl_input into \tl_output.
 *
 * The h/w of \tl_weight is used as both the scale factor and the kernel size: scale - 1
 * values are inserted between input elements, stride and dilation are 1, and the padding is
 * derived from the kernel and output sizes.
 *
 * \return always 0
 */
int cvm_upsample2d(cvk_context_t* ctx, cvk_tl_t* tl_input, cvk_tl_t* tl_weight,
                   cvk_tl_t* tl_output) {
  const int in_h = tl_input->shape.h;
  const int in_w = tl_input->shape.w;
  const int scale_h = tl_weight->shape.h;
  const int scale_w = tl_weight->shape.w;
  const int out_w = tl_output->shape.w;
  const int out_h = tl_output->shape.h;

  // kernel size equals the scale factor; dilation is fixed to 1 and the base padding to 0
  const int dilation = 1;
  const int kernel_ext_h = (scale_h - 1) * dilation + 1;
  const int kernel_ext_w = (scale_w - 1) * dilation + 1;

  cvk_tiu_depthwise_pt_convolution_param_t param = {0};
  param.ofmap = tl_output;
  param.ifmap = tl_input;
  param.weight = tl_weight;
  param.bias = 0;

  param.ins_h = scale_h - 1;
  param.ins_last_h = 0;
  param.ins_w = scale_w - 1;
  param.ins_last_w = 0;

  param.stride_h = 1;
  param.stride_w = 1;
  param.dilation_h = 1;
  param.dilation_w = 1;

  param.pad_top = kernel_ext_h - 1;
  param.pad_bottom = out_h - (in_h - 1) * scale_h - 1;
  param.pad_left = kernel_ext_w - 1;
  param.pad_right = out_w - (in_w - 1) * scale_w - 1;

  param.relu_enable = 0;
  param.ins_val = 0;  // symmetric quantization
  param.ins_fp = 0;   // symmetric quantization

  ctx->ops->tiu_pt_depthwise_convolution(ctx, &param);
  return 0;
}

270
cvimath/src/util.c Normal file
View File

@ -0,0 +1,270 @@
#include <assert.h>
#include <cvikernel/cvikernel.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "test_cvikernel_util.h"
/*
 * container_of(ptr, type, member): given `ptr`, a pointer to the field `member` of a
 * `type` object, yields a pointer to that enclosing object by subtracting
 * offsetof(type, member) from the address.
 * Relies on the GNU C extensions `typeof` and statement expressions `({ ... })`.
 * The intermediate __mptr is const-qualified, but the final cast yields a non-const `type *`.
 */
#define container_of(ptr, type, member) \
({ \
const typeof(((type *)0)->member) *__mptr = (ptr); \
(type *)((char *)__mptr - offsetof(type, member)); \
})
// Pairs a global-memory tensor descriptor with the runtime allocation backing it.
// Callers receive &tg; the wrapper is recovered from that pointer with container_of().
typedef struct {
cvk_tg_t tg;     // descriptor handed out to callers
CVI_RT_MEM mem;  // device allocation referenced by tg.start_address
} test_tg_wrapper_t;
// Pairs a global-memory matrix descriptor with the runtime allocation backing it.
// Callers receive &mg; the wrapper is recovered from that pointer with container_of().
typedef struct {
cvk_mg_t mg;     // descriptor handed out to callers
CVI_RT_MEM mem;  // device allocation referenced by mg.start_address
} test_mg_wrapper_t;
// Submits the commands recorded in cvk_ctx through CVI_RT_Submit().
// bm_ctx is accepted only to keep the signature; it is not used.
void test_submit_comp(CVI_RT_HANDLE *bm_ctx, cvk_context_t *cvk_ctx) {
  (void)bm_ctx;

  CVI_RT_Submit(cvk_ctx);
}
/**
 * Allocates device memory for a global tensor of the given shape/format and returns its
 * descriptor (default stride, base_reg_index 0).
 * The descriptor lives inside a heap-allocated test_tg_wrapper_t; release it with
 * test_free_tg_mem_comp().
 */
cvk_tg_t *test_alloc_tg_mem_comp(CVI_RT_HANDLE *bm_ctx, cvk_context_t *cvk_ctx,
                                 cvk_tg_shape_t shape, cvk_fmt_t fmt) {
  CVI_RT_HANDLE rt_handle = (CVI_RT_HANDLE)*bm_ctx;
  int byte_count = tg_shape_size(&shape) * bytesize_of_fmt(fmt);

  test_tg_wrapper_t *wrapper = (test_tg_wrapper_t *)malloc(sizeof(*wrapper));
  assert(wrapper && "Expected allocated tg wrapper");

  cvk_tg_t *tg = &wrapper->tg;
  tg->base_reg_index = 0;
  wrapper->mem = CVI_RT_MemAlloc(rt_handle, byte_count);
  tg->start_address = CVI_RT_MemGetPAddr(wrapper->mem);
  tg->fmt = fmt;
  tg->shape = shape;
  tg->stride = cvk_ctx->ops->tg_default_stride(cvk_ctx, shape, fmt);
  return tg;
}
/**
 * Allocates device memory for a global matrix of the given shape/format and returns its
 * descriptor (row stride = col * element size, base_reg_index 0).
 * The descriptor lives inside a heap-allocated test_mg_wrapper_t; release it with
 * test_free_mg_mem_comp().
 */
cvk_mg_t *test_alloc_mg_mem_comp(CVI_RT_HANDLE *bm_ctx, cvk_mg_shape_t s, cvk_fmt_t fmt) {
  int alloc_sz = mg_shape_size(&s) * bytesize_of_fmt(fmt);
  CVI_RT_HANDLE ctx = (CVI_RT_HANDLE)*bm_ctx;

  test_mg_wrapper_t *w = (test_mg_wrapper_t *)malloc(sizeof(*w));
  // same guard as the tensor variant: never write through a failed allocation
  assert(w && "Expected allocated mg wrapper");

  w->mem = CVI_RT_MemAlloc(ctx, alloc_sz);
  w->mg.base_reg_index = 0;
  w->mg.start_address = CVI_RT_MemGetPAddr(w->mem);
  w->mg.shape = s;
  w->mg.fmt = fmt;
  w->mg.stride.row = s.col * bytesize_of_fmt(fmt);
  return &w->mg;
}
// Frees the device memory and the wrapper behind a descriptor returned by
// test_alloc_tg_mem_comp().
void test_free_tg_mem_comp(CVI_RT_HANDLE *ctx, const cvk_tg_t *tg) {
  test_tg_wrapper_t *wrapper = container_of(tg, test_tg_wrapper_t, tg);

  CVI_RT_MemFree(*ctx, wrapper->mem);
  free(wrapper);
}
// Frees the device memory and the wrapper behind a descriptor returned by
// test_alloc_mg_mem_comp().
void test_free_mg_mem_comp(CVI_RT_HANDLE *ctx, const cvk_mg_t *mg) {
  test_mg_wrapper_t *wrapper = container_of(mg, test_mg_wrapper_t, mg);

  CVI_RT_MemFree(*ctx, wrapper->mem);
  free(wrapper);
}
// Copies host data into the device memory behind tg (CVI_RT_MemCopyS2D).
// tg must come from test_alloc_tg_mem_comp().
void test_put_tg_mem_comp(CVI_RT_HANDLE *bm_ctx, const cvk_tg_t *tg, uint8_t data[]) {
  test_tg_wrapper_t *wrapper = container_of(tg, test_tg_wrapper_t, tg);

  CVI_RT_MemCopyS2D(*bm_ctx, wrapper->mem, data);
}
// Copies host data into the device memory behind mg (CVI_RT_MemCopyS2D).
// mg must come from test_alloc_mg_mem_comp().
void test_put_mg_mem_comp(CVI_RT_HANDLE *ctx, const cvk_mg_t *mg, uint8_t data[]) {
  // Recover the wrapper the same way every sibling helper does, instead of a raw cast that
  // silently depends on `mg` being the first member and discards const.
  test_mg_wrapper_t *w = container_of(mg, test_mg_wrapper_t, mg);
  CVI_RT_MemCopyS2D(*ctx, w->mem, data);
}
/**
 * Reads back the device memory behind tg into a newly malloc'ed host buffer.
 * tg must come from test_alloc_tg_mem_comp(); the caller owns (and must free()) the result.
 */
uint8_t *test_get_tg_mem_comp(CVI_RT_HANDLE *bm_ctx, const cvk_tg_t *tg) {
  cvk_tg_shape_t s = tg->shape;
  // CVI_RT_MemCopyD2S() is given no length, so the host buffer is sized with the same
  // formula test_alloc_tg_mem_comp() used for the device buffer.
  uint32_t size = tg_shape_size(&s) * bytesize_of_fmt(tg->fmt);
  uint8_t *data = (uint8_t *)malloc(size);
  assert(data && "Expect allocated data for get tg mem");

  test_tg_wrapper_t *w = container_of(tg, test_tg_wrapper_t, tg);
  CVI_RT_MemCopyD2S(*bm_ctx, data, w->mem);
  return data;
}
/**
 * Reads back the device memory behind mg into a newly malloc'ed host buffer.
 * mg must come from test_alloc_mg_mem_comp(); the caller owns (and must free()) the result.
 */
uint8_t *test_get_mg_mem_comp(CVI_RT_HANDLE *ctx, const cvk_mg_t *mg) {
  cvk_mg_shape_t s = mg->shape;
  // CVI_RT_MemCopyD2S() is given no length, so the host buffer is sized with the same
  // formula test_alloc_mg_mem_comp() used for the device buffer.
  uint32_t size = mg_shape_size(&s) * bytesize_of_fmt(mg->fmt);
  uint8_t *data = (uint8_t *)malloc(size);
  assert(data && "Expect allocated data for get mg mem");

  test_mg_wrapper_t *w = container_of(mg, test_mg_wrapper_t, mg);
  CVI_RT_MemCopyD2S(*ctx, data, w->mem);
  return data;
}
/**
 * Copies a local-memory tensor to a temporary global tensor, submits the command buffer and
 * returns the data in a newly malloc'ed host buffer (caller must free()).
 */
uint8_t *test_get_tensor_l2g_comp(CVI_RT_HANDLE *bm_ctx, cvk_context_t *cvk_ctx,
                                  const cvk_tl_t *tl) {
  // The global tensor mirrors the local shape axis by axis, exactly as
  // test_put_tensor_g2l_comp() does for the opposite direction.
  cvk_tg_shape_t s;
  s.n = tl->shape.n;
  s.c = tl->shape.c;
  s.h = tl->shape.h;
  s.w = tl->shape.w;
  cvk_tg_t *tg = test_alloc_tg_mem_comp(bm_ctx, cvk_ctx, s, tl->fmt);

  cvk_tdma_l2g_tensor_copy_param_t p;
  // zero first so that fields other than src/dst never carry stack garbage
  memset(&p, 0, sizeof(p));
  p.src = tl;
  p.dst = tg;
  if (tl->fmt == CVK_FMT_BF16) {
    cvk_ctx->ops->tdma_l2g_bf16_tensor_copy(cvk_ctx, &p);
  } else {
    cvk_ctx->ops->tdma_l2g_tensor_copy(cvk_ctx, &p);
  }
  test_submit_comp(bm_ctx, cvk_ctx);

  uint8_t *data = test_get_tg_mem_comp(bm_ctx, tg);
  test_free_tg_mem_comp(bm_ctx, tg);
  return data;
}
/**
 * Copies a local-memory matrix to a temporary global matrix (row = ml->shape.n,
 * col = ml->shape.col), submits the command buffer and returns the data in a newly
 * malloc'ed host buffer (caller must free()).
 */
uint8_t *test_get_matrix_l2g_comp(CVI_RT_HANDLE *ctx, cvk_context_t *cvk_ctx, const cvk_ml_t *ml) {
  cvk_mg_shape_t s;
  s.row = ml->shape.n;
  s.col = ml->shape.col;
  cvk_mg_t *mg = test_alloc_mg_mem_comp(ctx, s, ml->fmt);

  cvk_tdma_l2g_matrix_copy_param_t p;
  // zero first so that fields other than src/dst never carry stack garbage
  memset(&p, 0, sizeof(p));
  p.src = ml;
  p.dst = mg;
  if (ml->fmt == CVK_FMT_BF16) {
    cvk_ctx->ops->tdma_l2g_bf16_matrix_copy(cvk_ctx, &p);
  } else {
    cvk_ctx->ops->tdma_l2g_matrix_copy(cvk_ctx, &p);
  }
  test_submit_comp(ctx, cvk_ctx);

  uint8_t *data = test_get_mg_mem_comp(ctx, mg);
  test_free_mg_mem_comp(ctx, mg);
  return data;
}
/**
 * Uploads host data into a local-memory tensor: stages it in a temporary global tensor of the
 * same n/c/h/w shape, issues the g2l copy, submits the command buffer and frees the staging
 * tensor.
 */
void test_put_tensor_g2l_comp(CVI_RT_HANDLE *bm_ctx, cvk_context_t *cvk_ctx, const cvk_tl_t *tl,
                              uint8_t data[]) {
  cvk_tg_shape_t tg_shape;
  tg_shape.n = tl->shape.n;
  tg_shape.c = tl->shape.c;
  tg_shape.h = tl->shape.h;
  tg_shape.w = tl->shape.w;
  cvk_tg_t *tg = test_alloc_tg_mem_comp(bm_ctx, cvk_ctx, tg_shape, tl->fmt);

  cvk_tdma_g2l_tensor_copy_param_t p;
  // zero first so that fields other than src/dst never carry stack garbage
  memset(&p, 0, sizeof(p));
  p.src = tg;
  p.dst = tl;

  test_put_tg_mem_comp(bm_ctx, tg, data);
  if (tl->fmt == CVK_FMT_BF16) {
    cvk_ctx->ops->tdma_g2l_bf16_tensor_copy(cvk_ctx, &p);
  } else {
    cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p);
  }
  test_submit_comp(bm_ctx, cvk_ctx);
  test_free_tg_mem_comp(bm_ctx, tg);
}
/**
 * Uploads host data into a local-memory matrix: stages it in a temporary global matrix
 * (row = ml->shape.n, col = ml->shape.col), issues the g2l copy, submits the command buffer
 * and frees the staging matrix.
 */
void test_put_matrix_g2l_comp(CVI_RT_HANDLE *bm_ctx, cvk_context_t *cvk_ctx, const cvk_ml_t *ml,
                              uint8_t data[]) {
  cvk_fmt_t mg_data_format = ml->fmt;
  cvk_mg_shape_t s;
  s.row = ml->shape.n;
  s.col = ml->shape.col;
  cvk_mg_t *mg = test_alloc_mg_mem_comp(bm_ctx, s, mg_data_format);

  cvk_tdma_g2l_matrix_copy_param_t p;
  // zero first so that fields other than src/dst never carry stack garbage
  memset(&p, 0, sizeof(p));
  p.src = mg;
  p.dst = ml;

  test_put_mg_mem_comp(bm_ctx, mg, data);
  if (ml->fmt == CVK_FMT_BF16) {
    cvk_ctx->ops->tdma_g2l_bf16_matrix_copy(cvk_ctx, &p);
  } else {
    cvk_ctx->ops->tdma_g2l_matrix_copy(cvk_ctx, &p);
  }
  test_submit_comp(bm_ctx, cvk_ctx);
  test_free_mg_mem_comp(bm_ctx, mg);
}
// Allocates a global matrix of shape s / format mg_data_format, fills it with data and
// returns it. The caller releases it with test_free_mg_mem_comp().
cvk_mg_t *test_put_matrix_g(CVI_RT_HANDLE *bm_ctx, const cvk_mg_shape_t s, cvk_fmt_t mg_data_format,
                            uint8_t data[]) {
  cvk_mg_t *matrix = test_alloc_mg_mem_comp(bm_ctx, s, mg_data_format);

  test_put_mg_mem_comp(bm_ctx, matrix, data);
  return matrix;
}
// Allocates a local-memory tensor through the kernel context's lmem_alloc_tensor op and
// returns whatever that op returns.
cvk_tl_t *test_alloc_tl(cvk_context_t *cvk_ctx, cvk_tl_shape_t shape, cvk_fmt_t fmt, int eu_align) {
  return cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, shape, fmt, eu_align);
}
// Releases a local-memory tensor through the kernel context's lmem_free_tensor op.
void test_free_tl(cvk_context_t *cvk_ctx, const cvk_tl_t *t)
{
  cvk_ctx->ops->lmem_free_tensor(cvk_ctx, t);
}
#define CNV_SCALAR_C_ALIGN (0x1000)
inline uint64_t cnvAlign64(const uint64_t length, const uint64_t align) {
uint64_t stride = (uint64_t)(length / align) * align;
if (stride < length) {
stride += align;
}
return stride;
}
/**
 * (Re)allocates the device buffer described by pAddrInfo and returns its virtual address.
 *
 * Any buffer already attached (vir_addr != NULL) is released first. The physical address is
 * rounded up to CNV_SCALAR_C_ALIGN and the virtual address is advanced by the same offset.
 * The allocation is enlarged by CNV_SCALAR_C_ALIGN - 1 bytes so that size_bytes bytes remain
 * usable behind the aligned address.
 */
uint8_t *test_get_vp_addr(bmctx_t *ctx, AddrInfo *pAddrInfo)
{
  if (pAddrInfo->vir_addr) {
    test_free_vp_addr(ctx, pAddrInfo);
  }

  // reserve room for the worst-case alignment shift applied below
  pAddrInfo->mem =
      bmmem_device_alloc_raw(*ctx, pAddrInfo->size_bytes + (CNV_SCALAR_C_ALIGN - 1));
  pAddrInfo->vir_addr = (uint8_t *)bmmem_device_v_addr(pAddrInfo->mem);
  pAddrInfo->phy_addr = bmmem_device_addr(pAddrInfo->mem);

  // shift both views of the buffer to the aligned start
  uint64_t new_paddr = cnvAlign64(pAddrInfo->phy_addr, CNV_SCALAR_C_ALIGN);
  uint64_t offset = new_paddr - pAddrInfo->phy_addr;
  pAddrInfo->phy_addr = new_paddr;
  pAddrInfo->vir_addr += offset;
  return pAddrInfo->vir_addr;
}
/**
 * Release the device buffer referenced by pAddrInfo->mem and mark the
 * descriptor as unmapped (vir_addr = NULL, phy_addr = -1).
 *
 * size_bytes is left untouched; test_get_vp_addr reads it again when it
 * reallocates after calling this function.
 */
void test_free_vp_addr(bmctx_t *ctx, AddrInfo *pAddrInfo) {
  bmmem_device_free(*ctx, pAddrInfo->mem);

  pAddrInfo->vir_addr = NULL;
  pAddrInfo->phy_addr = -1;
}

View File

@ -0,0 +1,34 @@
project(cvimath)
include(CTest)

# -Werror stays on by default (the original behaviour) but can be switched off,
# e.g. when building with a newer compiler that reports additional warnings.
option(CVIMATH_TESTS_WERROR "Treat warnings as errors when building the cvimath tests" ON)

# Helper sources shared by every test executable. The pattern is "*.c" (with
# the dot) so that only C sources are picked up, not every file whose name
# merely ends in the letter "c".
file(GLOB _TEST_UTILS "${CMAKE_CURRENT_SOURCE_DIR}/common/*.c")

# cvi1835 test
file(GLOB CVI1835_TESTS "${CMAKE_CURRENT_SOURCE_DIR}/cvi1835/*.cpp")

# FIXME: repair test case
list(FILTER CVI1835_TESTS EXCLUDE REGEX ".*atan2.*")
list(FILTER CVI1835_TESTS EXCLUDE REGEX ".*depthwise_reshape_same.*")

# One executable, one install rule and one CTest entry per test source file.
foreach(TEST_SRC ${CVI1835_TESTS})
  get_filename_component(TEST_NAME "${TEST_SRC}" NAME_WE)

  add_executable(${TEST_NAME} ${_TEST_UTILS} "${TEST_SRC}")

  target_include_directories(${TEST_NAME} PRIVATE
    "${CMAKE_CURRENT_SOURCE_DIR}/include"
    "${CMAKE_SOURCE_DIR}/include"
    "${CMAKE_SOURCE_DIR}/src"
  )

  # TPU_KERNEL_LIB and TEST_LIBS are expected to be set by the including scope.
  target_link_libraries(${TEST_NAME} PRIVATE ${TPU_KERNEL_LIB} ${TEST_LIBS})

  target_compile_options(${TEST_NAME} PRIVATE
    $<$<BOOL:${CVIMATH_TESTS_WERROR}>:-Werror>
    -Wall
    -Wextra
  )

  install(TARGETS ${TEST_NAME} DESTINATION bin)

  # NAME/COMMAND signature: the target name is resolved to the built
  # executable's location instead of relying on ctest's path search.
  add_test(NAME ${TEST_NAME} COMMAND ${TEST_NAME} ctest_test)
endforeach()

View File

@ -0,0 +1,980 @@
#include <assert.h>
#include <errno.h>
#include <float.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <test_native_ref.h>
// Min/max helper macros. Each argument is evaluated twice, so do not pass
// expressions with side effects. With a NaN operand the comparison is false
// and the second argument (y) is selected.
#define math_min(x, y) ((x) < (y) ? (x) : (y))
#define math_max(x, y) ((x) > (y) ? (x) : (y))
// Self-referential typedefs: each line redeclares a fixed-width type under its
// own name and therefore introduces nothing new.
// NOTE(review): presumably left over from a mechanical rename of legacy
// aliases — confirm whether they can be dropped.
typedef uint8_t uint8_t;
typedef uint16_t uint16_t;
typedef uint32_t uint32_t;
typedef uint64_t uint64_t;
typedef int8_t int8_t;
typedef int16_t int16_t;
typedef int32_t int32_t;
// The only real aliases in this group: s64 for int64_t, bmerr_t for uint32_t.
typedef int64_t s64;
typedef uint32_t bmerr_t;
// Status codes returned by the reference routines in this file.
#define BM_SUCCESS 0 // The operation was successful
#define BM_ERR_AGAIN 1 // Not ready yet
#define BM_ERR_FAILURE 2 // General failure
#define BM_ERR_TIMEOUT 3 // Timeout
#define BM_ERR_UNINITIALIZED 4 // Uninitialized
#define BM_ERR_INVALID_ARGUMENT 5 // Arguments invalid
#define BM_ERR_NOMEM 6 // Not enough memory
#define BM_ERR_DATA 7 // Data error
#define BM_ERR_BUSY 8 // Busy
#define BM_ERR_NOT_SUPPORTED 9 // Not supported yet
// Element-wise operation selector consumed by native_md_scalar and
// native_md_scalar_int8 (passed there as a plain int).
typedef uint32_t BLOB_OP;
#define BLOB_ADD 0
#define BLOB_SUB 1
#define BLOB_MUL 2
#define BLOB_DIV 3
#define BLOB_INVALID 4
/**
 * Flatten a 4-element index `offset` into a linear position inside a
 * row-major array whose extents are given by `shape`.
 *
 * shape[0] is never read; only shape[1..3] contribute to the result.
 */
static inline int calc_offset(int *shape, int *offset) {
  int linear = offset[0];
  for (int dim = 1; dim < 4; dim++) {
    linear = linear * shape[dim] + offset[dim];
  }
  return linear;
}
/** Linear index of column `w2` in row `h` of a row-major grid with `w1` columns. */
static int index_get(int h, int w1, int w2) {
  const int row_base = h * w1;
  return row_base + w2;
}
/**
 * Compare two float arrays element by element with a mixed relative/absolute
 * tolerance and stop at the first failing element.
 *
 * For each index:
 *  - if max(|exp|, |got|) > 1.0 the check is relative: it fails when
 *    min(|exp|, |got|) < 1e-20 or when |exp - got| > delta * min(|exp|, |got|);
 *  - otherwise the check is absolute: it fails when |exp - got| > delta.
 *
 * @param info  label printed in front of every diagnostic
 * @param p_exp expected values
 * @param p_got values under test
 * @param count number of elements to compare
 * @param delta tolerance (relative factor or absolute bound, see above)
 * @return 0 when no element fails, -1 on the first tolerance failure, -2 when
 *         `got` is NaN while `exp` is not. A failing element where both values
 *         are NaN returns 0 immediately, without checking the remaining
 *         elements.
 */
int array_cmp_float_rel(const char *const info, float *p_exp, float *p_got, int count,
float delta) {
int idx = 0;
for (idx = 0; idx < count; idx++) {
if (math_max(fabs(p_exp[idx]), fabs(p_got[idx])) > 1.0) {
// compare rel
// A near-zero operand would make the relative bound meaningless, so it is
// reported as a failure outright.
if (math_min(fabs(p_exp[idx]), fabs(p_got[idx])) < 1e-20) {
printf("%s rel error at index %d exp %.20f got %.20f\n", info, idx, p_exp[idx], p_got[idx]);
if (isnan(p_exp[idx]) && isnan(p_got[idx])) {
printf("both exp and got are NAN");
return 0;
}
return -1;
}
if (fabs(p_exp[idx] - p_got[idx]) > delta * math_min(fabs(p_exp[idx]), fabs(p_got[idx]))) {
printf("%s rel error at index %d exp %.20f got %.20f\n", info, idx, p_exp[idx], p_got[idx]);
if (isnan(p_exp[idx]) && isnan(p_got[idx])) {
printf("both exp and got are NAN");
return 0;
}
return -1;
}
} else {
// compare abs
if (fabs(p_exp[idx] - p_got[idx]) > delta) {
printf("%s abs error at index %d exp %.20f got %.20f\n", info, idx, p_exp[idx], p_got[idx]);
if (isnan(p_exp[idx]) && isnan(p_got[idx])) {
printf("both exp and got are NAN");
return 0;
}
return -1;
}
}
// Every ordered comparison above is false when an operand is NaN, so an
// unexpected NaN in `got` is only caught by this explicit check.
if (isnan(p_got[idx]) && !isnan(p_exp[idx])) {
printf("%s, found nans idx %d\n", info, idx);
printf("floating from exp %.10f got %.10f\n", p_exp[idx], p_got[idx]);
// IF_VAL is declared outside this file; its fval/ival members are used here
// to print the raw bit pattern of each float (presumably a float/int union).
IF_VAL exp, got;
exp.fval = p_exp[idx];
got.fval = p_got[idx];
printf("hex form exp %8.8x got %8.8x\n", exp.ival, got.ival);
return -2;
}
}
return 0;
}
/**
 * Compare two float arrays.
 *
 * With delta == 0 the comparison is exact and stops at the first differing
 * element; any other delta delegates to array_cmp_float_rel.
 *
 * @return 0 when the arrays match (or when the first differing pair is
 *         NaN/NaN), -1 on the first exact mismatch, otherwise the result of
 *         array_cmp_float_rel.
 */
int array_cmp_float(const char *const info, float *p_exp, float *p_got, int count, float delta) {
  if (delta != 0.0f) {
    return array_cmp_float_rel(info, p_exp, p_got, count, delta);
  }

  for (int i = 0; i < count; i++) {
    if (p_exp[i] == p_got[i]) {
      continue;
    }
    printf("%s error at index %d exp %.20f got %.20f\n", info, i, p_exp[i], p_got[i]);
    if (isnan(p_exp[i]) && isnan(p_got[i])) {
      printf("both exp and got are NAN\n");
      return 0;
    }
    return -1;
  }
  return 0;
}
/**
 * Compare two int arrays of `count` elements.
 *
 * @return 0 if all elements are equal, -1 (after printing the first
 *         mismatch, prefixed with `info`) otherwise.
 */
int array_cmp_int(const char *const info, int *p_exp, int *p_got, int count) {
  int pos = 0;
  while (pos < count) {
    const int expected = p_exp[pos];
    const int actual = p_got[pos];
    if (expected != actual) {
      printf("%s error at index %d exp %d got %d\n", info, pos, expected, actual);
      return -1;
    }
    pos++;
  }
  return 0;
}
/**
 * Compare two int8_t arrays of `count` elements.
 *
 * @return 0 if all elements are equal, -1 (after printing the first
 *         mismatch, prefixed with `info`) otherwise.
 */
int array_cmp_int8(const char *const info, const int8_t *p_exp, const int8_t *p_got, int count) {
  int pos = 0;
  while (pos < count) {
    if (p_exp[pos] != p_got[pos]) {
      printf("%s error at index %d exp %d got %d\n", info, pos, p_exp[pos], p_got[pos]);
      return -1;
    }
    pos++;
  }
  return 0;
}
/**
 * Size of one spatial dimension after inserting `ins_h` elements between
 * neighbouring inputs, `ins_h_l` elements after the last one, and adding both
 * paddings.
 *
 * The two padding arguments are only summed, so their order does not affect
 * the result.
 */
int calc_dilute_hw(int h, int ins_h, int ins_h_l, int pad_h_b, int pad_h_t) {
  const int diluted = (h - 1) * (ins_h + 1) + ins_h_l + 1;
  return diluted + pad_h_t + pad_h_b;
}
/** Number of window positions for an input extent `hw`, window `khw` and step `stride`. */
int calc_output_hw(int hw, int khw, int stride) {
  const int span = hw - khw;
  return span / stride + 1;
}
/**
 * Build a padded and diluted copy of a single int8 plane.
 *
 * Every output element is first set to `val`, then each input element
 * (h, w) is written to row h * (ins_h + 1) + pad_t and column
 * w * (ins_w + 1) + pad_l of the output.
 *
 * @param before  input plane of h_before x w_before elements
 * @param pafter  in/out: if *pafter is NULL a buffer is allocated and stored
 *                there (the caller frees it); otherwise the existing buffer is
 *                reused and must be large enough
 * @param val     fill value, applied with memset (so only its low byte counts)
 * @return BM_SUCCESS, BM_ERR_INVALID_ARGUMENT if `before` or `pafter` is NULL,
 *         or BM_ERR_NOMEM if the allocation fails
 */
int fill_pad_fmap_int8(const int8_t *before, int8_t **pafter, int val, int pad_l, int pad_r,
                       int pad_t, int pad_b, int ins_h, int ins_w, int ins_h_last, int ins_w_last,
                       int h_before, int w_before) {
  /* Validate before touching *pafter: the original read it ahead of this check. */
  if (!before || !pafter) return BM_ERR_INVALID_ARGUMENT;

  int w_after = (w_before - 1) * (ins_w + 1) + ins_w_last + 1 + pad_l + pad_r;
  int h_after = (h_before - 1) * (ins_h + 1) + ins_h_last + 1 + pad_t + pad_b;

  int8_t *after = *pafter;
  if (!after) {
    after = (int8_t *)malloc(sizeof(int8_t) * w_after * h_after);
    if (!after) return BM_ERR_NOMEM;
  }

  memset(after, val, w_after * h_after);

  for (int h = 0; h < h_before; h++) {
    for (int w = 0; w < w_before; w++) {
      int i = (h * (ins_h + 1) + pad_t) * w_after + w * (ins_w + 1) + pad_l;
      after[i] = before[h * w_before + w];
    }
  }

  *pafter = after;
  return BM_SUCCESS;
}
/**
 * Build a padded and diluted copy of a single plane of 16-bit elements.
 *
 * Every output element is first set to `val`, then each input element
 * (h, w) is written to row h * (ins_h + 1) + pad_t and column
 * w * (ins_w + 1) + pad_l of the output.
 *
 * @param before  input plane of h_before x w_before elements
 * @param pafter  in/out: if *pafter is NULL a buffer is allocated and stored
 *                there (the caller frees it); otherwise the existing buffer is
 *                reused and must be large enough
 * @param val     fill value, converted to uint16_t on assignment
 * @return BM_SUCCESS, BM_ERR_INVALID_ARGUMENT if `before` or `pafter` is NULL,
 *         or BM_ERR_NOMEM if the allocation fails
 */
int fill_pad_fmap_bf16(const uint16_t *before, uint16_t **pafter, int val, int pad_l, int pad_r,
                       int pad_t, int pad_b, int ins_h, int ins_w, int ins_h_last, int ins_w_last,
                       int h_before, int w_before) {
  /* Validate before touching *pafter: the original read it ahead of this check. */
  if (!before || !pafter) return BM_ERR_INVALID_ARGUMENT;

  int w_after = (w_before - 1) * (ins_w + 1) + ins_w_last + 1 + pad_l + pad_r;
  int h_after = (h_before - 1) * (ins_h + 1) + ins_h_last + 1 + pad_t + pad_b;

  uint16_t *after = *pafter;
  if (!after) {
    after = (uint16_t *)malloc(sizeof(uint16_t) * w_after * h_after);
    if (!after) return BM_ERR_NOMEM;
  }

  for (int i = 0; i < w_after * h_after; i++) after[i] = val;

  for (int h = 0; h < h_before; h++) {
    for (int w = 0; w < w_before; w++) {
      int i = (h * (ins_h + 1) + pad_t) * w_after + w * (ins_w + 1) + pad_l;
      after[i] = before[h * w_before + w];
    }
  }
#if 0
  printf("bf16 padding:\n");
  for(int i=0;i<h_after;i++) {
    printf("[\n");
    for(int j=0;j<w_after;j++)
      printf("%04x ", (after[i*w_after+j]));
    printf("\n");
  }
  printf("]\n");
#endif
  *pafter = after;
  return BM_SUCCESS;
}
/** Widen `len` signed 8-bit values from `psrc` into the int array `pdest`. */
void fill_int_with_int8(int *pdest, int8_t *psrc, int len) {
  int pos = 0;
  while (pos < len) {
    pdest[pos] = (int)psrc[pos];
    pos++;
  }
}
/** Widen `len` unsigned 8-bit values from `psrc` into the int array `pdest`. */
void fill_int_with_uint8(int *pdest, uint8_t *psrc, int len) {
  int pos = 0;
  while (pos < len) {
    pdest[pos] = psrc[pos];
    pos++;
  }
}
/** Widen `len` signed 16-bit values from `psrc` into the int array `pdest`. */
void fill_int_with_int16(int *pdest, int16_t *psrc, int len) {
  int pos = 0;
  while (pos < len) {
    pdest[pos] = (int16_t)psrc[pos];
    pos++;
  }
}
/**
 * Integer dot product of `a` and `b` over `len` elements, stored in *c.
 *
 * *c is reset to 0 first and accumulated in place.
 */
void inner_product(const int *a, const int *b, int len, int *c) {
  *c = 0;
  int pos = 0;
  while (pos < len) {
    *c += (a[pos] * b[pos]);
    pos++;
  }
}
/**
 * Float dot product of `a` and `b` over `len` elements, stored in *c.
 *
 * *c is reset to 0 first and accumulated in place, in index order.
 */
void inner_float_product(const float *a, const float *b, int len, float *c) {
  *c = 0;
  int pos = 0;
  while (pos < len) {
    *c += (a[pos] * b[pos]);
    pos++;
  }
}
/**
 * Build a padded and diluted copy of a single float plane.
 *
 * The whole output is filled with `pad_value`. For every input row, the
 * output rows it covers (the row itself plus the inserted rows that follow)
 * are zeroed between the left and right padding, and the input values are
 * then written every (ins_w + 1) columns.
 *
 * @param before input plane of h x w elements
 * @param after  in/out: if *after is NULL a buffer is allocated and stored
 *               there (the caller frees it); otherwise it is reused
 * @return BM_SUCCESS, BM_ERR_INVALID_ARGUMENT if `before` or `after` is NULL,
 *         or BM_ERR_NOMEM if the allocation fails
 */
int fill_pad_fmap_fp32(const float *before, float **after, float pad_value, int pad_h_t,
                       int pad_h_b, int pad_w_l, int pad_w_r, int ins_h, int ins_w, int ins_h_l,
                       int ins_w_l, int h, int w) {
  if (before == NULL || after == NULL) {
    return BM_ERR_INVALID_ARGUMENT;
  }

  const int h_after = calc_dilute_hw(h, ins_h, ins_h_l, pad_h_b, pad_h_t);
  const int w_after = calc_dilute_hw(w, ins_w, ins_w_l, pad_w_l, pad_w_r);

  if (*after == NULL) {
    *after = (float *)malloc(sizeof(float) * h_after * w_after);
    if (*after == NULL) {
      printf("No enough memory: [h_after, w_after]=[%i, %i].\n", h_after, w_after);
      return BM_ERR_NOMEM;
    }
  }

  float *padded = *after;
  const int total = h_after * w_after;
  for (int pos = 0; pos < total; pos++) {
    padded[pos] = pad_value;
  }

  /* Bytes between the left and right padding of one output row. */
  const size_t inner_row_bytes = sizeof(float) * (w_after - pad_w_l - pad_w_r);

  for (int row = 0; row < h; row++) {
    float *row_start = padded + (pad_h_t + row * (ins_h + 1)) * w_after + pad_w_l;
    /* The last input row is followed by ins_h_l inserted rows, the others by ins_h. */
    const int inserted_rows = (row == h - 1) ? ins_h_l : ins_h;

    for (int extra = 0; extra <= inserted_rows; extra++) {
      memset(row_start + extra * w_after, 0, inner_row_bytes);
    }
    for (int col = 0; col < w; col++) {
      row_start[col * (ins_w + 1)] = before[row * w + col];
    }
  }
  return BM_SUCCESS;
}
/**
 * Element-wise float arithmetic over N * C * H * W elements.
 *
 * `op` selects BLOB_ADD, BLOB_SUB, BLOB_MUL or BLOB_DIV; any other value trips
 * assert(0) for each element. `result_add` only affects BLOB_MUL, where the
 * product is added to the existing r[i] instead of replacing it.
 */
void native_md_scalar(float *a, float *b, float *r, int N, int C, int H, int W, int op,
                      bool result_add) {
  const int total = N * C * H * W;
  for (int pos = 0; pos < total; pos++) {
    if (op == BLOB_ADD) {
      r[pos] = a[pos] + b[pos];
    } else if (op == BLOB_SUB) {
      r[pos] = a[pos] - b[pos];
    } else if (op == BLOB_MUL) {
      r[pos] = result_add ? r[pos] : 0;
      r[pos] += a[pos] * b[pos];
    } else if (op == BLOB_DIV) {
      r[pos] = a[pos] / b[pos];
    } else {
      assert(0);
    }
  }
}
/**
 * Element-wise int8 arithmetic over N * C * H * W elements.
 *
 * `op` selects BLOB_ADD, BLOB_SUB, BLOB_MUL or BLOB_DIV; any other value trips
 * assert(0) for each element. `result_add` only affects BLOB_MUL, where the
 * product is added to the existing r[i] instead of replacing it. Results are
 * converted back to int8_t on assignment, with no saturation.
 */
void native_md_scalar_int8(int8_t *a, int8_t *b, int8_t *r, int N, int C, int H, int W, int op,
                           bool result_add) {
  const int total = N * C * H * W;
  for (int pos = 0; pos < total; pos++) {
    if (op == BLOB_ADD) {
      r[pos] = a[pos] + b[pos];
    } else if (op == BLOB_SUB) {
      r[pos] = a[pos] - b[pos];
    } else if (op == BLOB_MUL) {
      r[pos] = result_add ? r[pos] : 0;
      r[pos] += a[pos] * b[pos];
    } else if (op == BLOB_DIV) {
      r[pos] = a[pos] / b[pos];
    } else {
      assert(0);
    }
  }
}
/**
 * Sum of element-wise products of two dim_n x dim_m int8 matrices.
 *
 * When `opd0_sign` is zero the elements of A are reinterpreted as unsigned
 * 8-bit values before multiplying; B is always treated as signed.
 */
static int matrix_dot_mult(int8_t *A, int8_t *B, int dim_n, int dim_m, int opd0_sign) {
  int acc = 0;
  for (int row = 0; row < dim_n; row++) {
    for (int col = 0; col < dim_m; col++) {
      const int pos = index_get(row, dim_m, col);
      const int lhs = opd0_sign ? (int)A[pos] : (int)((uint8_t)A[pos]);
      acc += lhs * B[pos];
    }
  }
  return acc;
}
int native_conv_int8(const int8_t *ifmap, const int8_t *weight, const int16_t *bias, int8_t *ofmap,
int in, int ic, int ih, int iw, int oc, int kh, int kw, int dh, int dw,
int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, int stride_h, int stride_w,
int ins_h, int ins_w, int ins_h_last, int ins_w_last, int input_sign,
int r_shift_width, int do_relu) {
int ih_ext = calc_dilute_hw(ih, ins_h, ins_h_last, pad_h_t, pad_h_b);
int iw_ext = calc_dilute_hw(iw, ins_w, ins_w_last, pad_w_l, pad_w_r);
int kh_ext = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
int kw_ext = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
int oh = calc_output_hw(ih_ext, kh_ext, stride_h);
int ow = calc_output_hw(iw_ext, kw_ext, stride_w);
int *result = (int *)malloc(sizeof(int) * in * oc * oh * ow);
memset(result, 0, sizeof(int) * in * oc * oh * ow);
int8_t *i_fmap_pad_ker = (int8_t *)malloc(kh_ext * kw_ext);
int ret = BM_SUCCESS;
int8_t *i_fmap_pad = NULL;
int8_t *kernel_after = NULL;
for (int n = 0; n < in; ++n) {
for (int c = 0; c < oc; ++c) {
for (int cc = 0; cc < ic; ++cc) {
fill_pad_fmap_int8((int8_t *)ifmap + n * ic * ih * iw + cc * ih * iw, &i_fmap_pad, 0,
pad_w_l, pad_w_r, pad_h_t, pad_h_b, ins_h, ins_w, ins_h_last, ins_w_last,
ih, iw);
// kernel_dilation(
fill_pad_fmap_int8((weight + c * ic * kh * kw + cc * kh * kw), &kernel_after, 0, 0, 0, 0,
0, // no padding
dh - 1, dw - 1, 0, 0, kh, kw);
for (int ph = 0; ph < oh; ++ph) {
for (int pw = 0; pw < ow; ++pw) {
for (int idxh = 0; idxh < kh_ext; ++idxh)
for (int idxw = 0; idxw < kw_ext; ++idxw) {
i_fmap_pad_ker[idxh * kw_ext + idxw] =
i_fmap_pad[(idxh + ph * stride_h) * iw_ext + idxw + pw * stride_w];
}
result[n * oc * oh * ow + c * oh * ow + ph * ow + pw] +=
matrix_dot_mult(i_fmap_pad_ker, kernel_after, kh_ext, kw_ext, input_sign);
}
}
}
if (bias) {
for (int ph = 0; ph < oh; ++ph) {
for (int pw = 0; pw < ow; ++pw) {
result[n * oc * oh * ow + c * oh * ow + ph * ow + pw] += bias[c]; // bias+c ;
}
}
}
ret = satu_2_8bit(&result[n * oc * oh * ow + c * oh * ow], oh * ow,
&ofmap[n * oc * oh * ow + c * oh * ow], r_shift_width, 1, !do_relu);
if (ret != BM_SUCCESS) goto error_release;
} // end for (int c = 0; c < oc; ++c)
} // end for (int n = 0; n < in; n++)
error_release:
free(i_fmap_pad);
free(kernel_after);
free(i_fmap_pad_ker);
free(result);
return ret;
}
/**
 * Reference float depthwise convolution: every input channel is convolved
 * with its own kh x kw kernel.
 *
 * Each plane is padded/diluted and each kernel dilated with
 * fill_pad_fmap_fp32. Products are accumulated in two partial sums (even and
 * odd tap positions) that are added together with the bias at the end; the
 * summation order is kept as is because it affects float rounding.
 *
 * @param bias optional per-channel bias (NULL means 0)
 * @return BM_SUCCESS, BM_ERR_NOMEM if a scratch buffer cannot be allocated, or
 *         BM_ERR_FAILURE if padding fails
 */
int native_depthwise_fp32(const float *ifmap, const float *weight, const float *bias, float *ofmap,
                          int in, int ic, int ih, int iw, int kh, int kw, int dh, int dw,
                          int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, int stride_h,
                          int stride_w, int ins_h, int ins_w, int ins_h_last, int ins_w_last) {
  int h_after = calc_dilute_hw(ih, ins_h, ins_h_last, pad_h_t, pad_h_b);
  int w_after = calc_dilute_hw(iw, ins_w, ins_w_last, pad_w_l, pad_w_r);
  int kh_dilation = (kh - 1) * dh + 1, kw_dilatoin = (kw - 1) * dw + 1;
  int oh = calc_output_hw(h_after, kh_dilation, stride_h);
  int ow = calc_output_hw(w_after, kw_dilatoin, stride_w);

  float *ifmap_after = (float *)malloc(sizeof(float) * h_after * w_after);
  float *weight_dilation = (float *)malloc(sizeof(float) * kh_dilation * kw_dilatoin);
  if (ifmap_after == NULL || weight_dilation == NULL) {
    printf("No enough memory.\n");
    free(ifmap_after);
    free(weight_dilation);
    return BM_ERR_NOMEM;
  }

  for (int n = 0; n < in; n++) {
    for (int c = 0; c < ic; c++, ifmap += ih * iw, ofmap += oh * ow) {
      float init_value = bias ? bias[c] : 0;
      int ret_ifmap = fill_pad_fmap_fp32(ifmap, &ifmap_after, 0, pad_h_t, pad_h_b, pad_w_l, pad_w_r,
                                         ins_h, ins_w, ins_h_last, ins_w_last, ih, iw);
      int ret_weight = fill_pad_fmap_fp32(weight + c * kh * kw, &weight_dilation, 0, 0, 0, 0, 0,
                                          dh - 1, dw - 1, 0, 0, kh, kw);
      if ((ret_ifmap != BM_SUCCESS) || (ret_weight != BM_SUCCESS)) {
        printf("failed to pad ifmap or weight.\n");
        /* Release the scratch buffers: the original leaked them on this path. */
        free(ifmap_after);
        free(weight_dilation);
        return BM_ERR_FAILURE;
      }

      for (int h = 0; h < oh; h++) {
        for (int w = 0; w < ow; w++) {
          int rf_h = h * stride_h, rf_w = w * stride_w;
          /* Clip the window so it never reads past the padded plane. */
          int kh_end = math_min(kh_dilation, h_after - rf_h);
          int kw_end = math_min(kw_dilatoin, w_after - rf_w);
          float *rf_addr = ifmap_after + rf_h * w_after + rf_w;
          float dot_product_even = 0.0, dot_product_odd = 0.0;
          for (int i = 0; i < kh_end; i++) {
            for (int j = 0; j < kw_end; j++) {
              if ((i * kw_end + j) % 2) {
                dot_product_odd += rf_addr[i * w_after + j] * weight_dilation[i * kw_dilatoin + j];
              } else {
                dot_product_even += rf_addr[i * w_after + j] * weight_dilation[i * kw_dilatoin + j];
              }
            }
          }
          ofmap[h * ow + w] = dot_product_even + dot_product_odd + init_value;
        }
      }
    }
  }

  free(ifmap_after);
  free(weight_dilation);
  return BM_SUCCESS;
}
/**
 * Reference float convolution with groups, dilation, symmetric zero padding
 * and optional kernel flip.
 *
 * The void pointers are reinterpreted as float arrays. Indexing goes through
 * calc_offset with 4-element shapes: input [input_n][input_c][input_h][input_w],
 * output [input_n][output_c][output_h][output_w] and weights
 * [output_c][input_c / groups][kh][kw].
 *
 * @param flip       non-zero: read the kernel mirrored in both spatial axes
 * @param using_bias non-zero: add bias[output channel] to every output element
 * @param result_add non-zero: add the value already present in ofmap to the
 *                   result; zero: ofmap is cleared first
 *
 * output_h / output_w must equal the sizes derived from the other arguments;
 * this is only enforced through assert().
 */
void native_conv_ref(const void *ifmap, void *ofmap, const void *weight, int input_n, int input_c,
int input_h, int input_w, int output_c, int output_h, int output_w, int groups,
int kh, int kw, int dilation_h, int dilation_w, int pad_h, int pad_w,
int stride_h, int stride_w, int flip, int using_bias, const void *bias,
int result_add) {
int kh_extent = dilation_h * (kh - 1) + 1;
int kw_extent = dilation_w * (kw - 1) + 1;
int output_h_expect = (input_h + 2 * pad_h - kh_extent) / stride_h + 1;
int output_w_expect = (input_w + 2 * pad_w - kw_extent) / stride_w + 1;
// The casts keep the variables "used" when assert() is compiled out (NDEBUG).
(void)output_h_expect;
(void)output_w_expect;
assert(output_h == output_h_expect && "Expect same output_h");
assert(output_w == output_w_expect && "Expect same output_w");
if (!result_add) {
memset(ofmap, 0, input_n * output_c * output_h * output_w * sizeof(float));
}
float *ifmap_f = (float *)ifmap;
float *ofmap_f = (float *)ofmap;
float *weight_f = (float *)weight;
float *bias_f = (float *)bias;
int i_shape[4];
i_shape[0] = input_n;
i_shape[1] = input_c;
i_shape[2] = input_h;
i_shape[3] = input_w;
int o_shape[4];
o_shape[0] = input_n;
o_shape[1] = output_c;
o_shape[2] = output_h;
o_shape[3] = output_w;
int k_shape[4];
k_shape[0] = output_c;
k_shape[1] = input_c / groups;
k_shape[2] = kh;
k_shape[3] = kw;
// Channels per group on the output (o_g) and input (k_g) side.
int o_g = output_c / groups;
int k_g = input_c / groups;
// First output / input channel of the current group.
int o_head, k_head;
int weight_offset[4];
int in_offset[4];
int out_offset[4];
for (int n = 0; n < input_n; n++) {
for (int g = 0; g < groups; g++) {
o_head = o_g * g;
k_head = k_g * g;
for (int o = 0; o < o_g; o++) {
for (int y = 0; y < output_h; y++) {
for (int x = 0; x < output_w; x++) {
out_offset[0] = n;
out_offset[1] = o + o_head;
out_offset[2] = y;
out_offset[3] = x;
// Remember the previous output value, then accumulate from zero; the saved
// value is added back at the end only when result_add is set.
float result_init = ofmap_f[calc_offset(o_shape, out_offset)];
ofmap_f[calc_offset(o_shape, out_offset)] = 0.0f;
for (int k = 0; k < k_g; k++) {
for (int p = 0; p < kh; p++) {
for (int q = 0; q < kw; q++) {
int in_y = y * stride_h - pad_h + p * dilation_h;
int in_x = x * stride_w - pad_w + q * dilation_w;
// Taps that fall into the padding area contribute nothing and are skipped.
if (in_y >= 0 && in_y < input_h && in_x >= 0 && in_x < input_w) {
weight_offset[0] = o + o_head;
weight_offset[1] = k;
if (flip) {
weight_offset[2] = (kh - 1 - p);
weight_offset[3] = (kw - 1 - q);
} else {
weight_offset[2] = p;
weight_offset[3] = q;
}
in_offset[0] = n;
in_offset[1] = k + k_head;
in_offset[2] = in_y;
in_offset[3] = in_x;
ofmap_f[calc_offset(o_shape, out_offset)] +=
ifmap_f[calc_offset(i_shape, in_offset)] *
weight_f[calc_offset(k_shape, weight_offset)];
// Single-tap case (one channel per group, 1x1 kernel): the output is
// overwritten with the lone product instead of accumulated.
if (k_g == 1 && kh == 1 && kw == 1) {
ofmap_f[calc_offset(o_shape, out_offset)] =
ifmap_f[calc_offset(i_shape, in_offset)] *
weight_f[calc_offset(k_shape, weight_offset)];
}
}
}
}
}
if (using_bias) {
ofmap_f[calc_offset(o_shape, out_offset)] += bias_f[o + o_head];
}
if (result_add) {
ofmap_f[calc_offset(o_shape, out_offset)] += result_init;
}
}
}
}
}
}
}
int native_fc_int8(const int8_t *L, const int8_t *R, const int16_t *B, const int16_t *Y, int *Y_ref,
int L_row_num, int L_col_num, int R_col_num, int L_sign, int R_sign, int B_sign,
int l_shift_width, int r_shift_width, int is_result_int8, int do_relu) {
const uint8_t *uL = (const uint8_t *)L;
const uint8_t *uR = (const uint8_t *)R;
const uint16_t *uB = (const uint16_t *)B;
int opd0, opd1, opd2;
int ret = BM_SUCCESS;
for (int hidx = 0; hidx < L_row_num; hidx++) {
for (int widx = 0; widx < R_col_num; widx++) {
int Y1 = 0;
int Y2 = 0;
int sum_idx = 0;
for (sum_idx = 0; sum_idx < L_col_num; sum_idx++) {
int idx_L = index_get(hidx, L_col_num, sum_idx);
int idx_R = index_get(sum_idx, R_col_num, widx);
opd0 = (L_sign) ? L[idx_L] : uL[idx_L];
opd1 = (R_sign) ? R[idx_R] : uR[idx_R];
if ((sum_idx % 2 == 0 && sum_idx != 2) || sum_idx == 1) {
Y1 += opd0 * opd1;
} else {
Y2 += opd0 * opd1;
}
}
sum_idx++;
if (B) {
opd2 = (B_sign) ? (int)B[widx] : (int)uB[widx];
if ((sum_idx % 2 == 0 && sum_idx != 2) || sum_idx == 1) {
Y1 += opd2;
} else {
Y2 += opd2;
}
sum_idx++;
}
int idx_Y = index_get(hidx, R_col_num, widx);
if (Y) {
if ((sum_idx % 2 == 0 && sum_idx != 2) || sum_idx == 1) {
Y1 += (Y[idx_Y] << l_shift_width);
} else {
Y2 += (Y[idx_Y] << l_shift_width);
}
}
Y_ref[idx_Y] = Y1 + Y2;
}
}
uint8_t *Yout_int8 = malloc(sizeof(int8_t) * L_row_num * R_col_num);
uint16_t *Yout_int16 = malloc(sizeof(int16_t) * L_row_num * R_col_num);
if (is_result_int8) {
ret =
satu_2_8bit(Y_ref, L_row_num * R_col_num, (int8_t *)Yout_int8, r_shift_width, 1, !do_relu);
if (ret != BM_SUCCESS) goto error_release;
fill_int_with_int8(Y_ref, (int8_t *)Yout_int8, L_row_num * R_col_num);
} else {
ret = satu_2_16bit(Y_ref, L_row_num * R_col_num, (int16_t *)Yout_int16, r_shift_width, 1,
!do_relu);
if (ret != BM_SUCCESS) goto error_release;
fill_int_with_int16(Y_ref, (int16_t *)Yout_int16, L_row_num * R_col_num);
}
error_release:
free(Yout_int8);
free(Yout_int16);
return ret;
}
int native_pooling_ave_int8(const int8_t *i_fmap, const void *weight, const int16_t *bias,
int8_t *o_fmap, int input_n, int input_c, int input_h, int input_w,
int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r,
int stride_h, int stride_w, int ins_h, int ins_w, int ins_h_last,
int ins_w_last, int input_sign, int satu_sign, int r_shift_width,
int const_weight) {
if (kh * kw <= 0) return BM_ERR_INVALID_ARGUMENT;
int *avg_pooling_mac_a = (int *)malloc(kh * kw * sizeof(int));
int *avg_pooling_mac_b = (int *)malloc(kh * kw * sizeof(int));
uint8_t avg_const_weight = *(uint8_t *)weight;
const int8_t *weight_arr = weight;
int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b);
int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r);
int output_h = calc_output_hw(h_after, kh, stride_h);
int output_w = calc_output_hw(w_after, kw, stride_w);
int8_t *i_fmap_pad = NULL;
for (int n = 0; n < input_n; n++) {
if (const_weight == 0) weight_arr = weight;
for (int c = 0; c < input_c; ++c) {
fill_pad_fmap_int8(i_fmap, &i_fmap_pad, 0, pad_w_l, pad_w_r, pad_h_t, pad_h_b, ins_h, ins_w,
ins_h_last, ins_w_last, input_h, input_w);
for (int ph = 0; ph < output_h; ++ph) {
for (int pw = 0; pw < output_w; ++pw) {
int hstart = ph * stride_h;
int wstart = pw * stride_w;
int pool_index = index_get(ph, output_w, pw);
int mac_index = 0;
int avg_pool_result;
for (int h = 0; h < kh; h++) {
for (int w = 0; w < kw; w++) {
int index = index_get((hstart + h), w_after, (w + wstart));
mac_index = index_get(h, kw, w);
avg_pooling_mac_a[mac_index] =
input_sign ? i_fmap_pad[index] : (uint8_t)(i_fmap_pad[index]);
avg_pooling_mac_b[mac_index] =
const_weight ? avg_const_weight : weight_arr[mac_index];
}
}
inner_product(avg_pooling_mac_a, avg_pooling_mac_b, kh * kw, &avg_pool_result);
if (bias) {
avg_pool_result += bias[c];
}
int ret = satu_2_8bit(&avg_pool_result, sizeof(int8_t), o_fmap + pool_index,
r_shift_width, 1, satu_sign);
if (ret != BM_SUCCESS) {
free(i_fmap_pad);
free(avg_pooling_mac_a);
free(avg_pooling_mac_b);
return BM_ERR_INVALID_ARGUMENT;
}
}
}
i_fmap += input_w * input_h;
if (const_weight == 0) weight_arr += kh * kw;
o_fmap += output_w * output_h;
}
}
free(i_fmap_pad);
free(avg_pooling_mac_a);
free(avg_pooling_mac_b);
return BM_SUCCESS;
}
/**
 * Reference int8 max pooling over input_n * input_c planes.
 *
 * Insertion is not supported: any non-zero ins_* argument is rejected. Each
 * plane is padded with the smallest representable value (-128 for signed
 * input, 0 for unsigned) so that padding never wins the maximum.
 *
 * @param input_sign non-zero: input bytes are signed; zero: unsigned
 * @return BM_SUCCESS, BM_ERR_INVALID_ARGUMENT for non-zero ins_* arguments, or
 *         the error reported by fill_pad_fmap_int8
 */
int native_pooling_max_int8(const int8_t *i_fmap, int8_t *o_fmap, int input_n, int input_c,
                            int input_h, int input_w, int kh, int kw, int pad_h_t, int pad_h_b,
                            int pad_w_l, int pad_w_r, int stride_h, int stride_w, int ins_h,
                            int ins_w, int ins_h_last, int ins_w_last, int input_sign) {
  if (ins_h != 0 || ins_w != 0 || ins_h_last != 0 || ins_w_last != 0)
    return BM_ERR_INVALID_ARGUMENT;

  int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b);
  int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r);
  int output_h = calc_output_hw(h_after, kh, stride_h);
  int output_w = calc_output_hw(w_after, kw, stride_w);
  const int max_init = input_sign ? -128 : 0;
  int8_t *i_fmap_pad = NULL;

  for (int nc = 0; nc < input_n * input_c; nc++) {
    /* The original ignored this result and would dereference NULL on failure. */
    int pad_ret = fill_pad_fmap_int8(i_fmap, &i_fmap_pad, max_init, pad_w_l, pad_w_r, pad_h_t,
                                     pad_h_b, 0, 0, 0, 0, input_h, input_w);
    if (pad_ret != BM_SUCCESS) {
      free(i_fmap_pad);
      return pad_ret;
    }

    for (int ph = 0; ph < output_h; ++ph) {
      for (int pw = 0; pw < output_w; ++pw) {
        int hstart = ph * stride_h;
        int wstart = pw * stride_w;
        int pool_index = index_get(ph, output_w, pw);
        int max = max_init;
        for (int h = 0; h < kh; h++) {
          for (int w = 0; w < kw; w++) {
            /* Row pitch of the padded plane (no insertion, so width + both pads). */
            int index = index_get((hstart + h), (input_w + pad_w_l + pad_w_r), (w + wstart));
            int val = input_sign ? i_fmap_pad[index] : (uint8_t)i_fmap_pad[index];
            max = (val > max) ? val : max;
          }
        }
        o_fmap[pool_index] = max;
      }
    }
    i_fmap += input_w * input_h;
    o_fmap += output_w * output_h;
  }

  free(i_fmap_pad);
  return BM_SUCCESS;
}
/**
 * Reference float max pooling over input_n * input_c planes.
 *
 * Each plane is padded/diluted with fill_pad_fmap_fp32 using -FLT_MAX as the
 * padding value; windows are clipped at the edge of the padded plane.
 *
 * @return BM_SUCCESS, BM_ERR_NOMEM if the scratch plane cannot be allocated,
 *         or BM_ERR_FAILURE if padding fails
 */
int native_pooling_max_fp32(const float *ifmap, float *ofmap, int input_n, int input_c, int input_h,
                            int input_w, int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l,
                            int pad_w_r, int stride_h, int stride_w, int ins_h, int ins_w,
                            int ins_h_last, int ins_w_last) {
  const int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b);
  const int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r);
  const int output_h = calc_output_hw(h_after, kh, stride_h);
  const int output_w = calc_output_hw(w_after, kw, stride_w);

  float *padded = (float *)malloc(sizeof(float) * h_after * w_after);
  if (padded == NULL) {
    printf("No enough memory[h_after, w_after]: [%u, %u].\n", h_after, w_after);
    return BM_ERR_NOMEM;
  }

  for (int n = 0; n < input_n; n++) {
    for (int c = 0; c < input_c; c++) {
      const int pad_status =
          fill_pad_fmap_fp32(ifmap, &padded, -FLT_MAX, pad_h_t, pad_h_b, pad_w_l, pad_w_r, ins_h,
                             ins_w, ins_h_last, ins_w_last, input_h, input_w);
      if (pad_status != BM_SUCCESS) {
        printf("Failed to pad input fmap.\n");
        free(padded);
        return BM_ERR_FAILURE;
      }

      for (int out_y = 0; out_y < output_h; out_y++) {
        const int top = out_y * stride_h;
        const int rows = math_min(kh, h_after - top);
        for (int out_x = 0; out_x < output_w; out_x++) {
          const int left = out_x * stride_w;
          const int cols = math_min(kw, w_after - left);
          const float *window = padded + top * w_after + left;

          float best = -FLT_MAX;
          for (int i = 0; i < rows; i++) {
            for (int j = 0; j < cols; j++) {
              best = math_max(window[i * w_after + j], best);
            }
          }
          ofmap[out_y * output_w + out_x] = best;
        }
      }

      ifmap += input_h * input_w;
      ofmap += output_h * output_w;
    }
  }

  free(padded);
  return BM_SUCCESS;
}
/**
 * Reference float average pooling over input_n * input_c planes.
 *
 * Each plane is zero-padded/diluted with fill_pad_fmap_fp32. Every window
 * element is multiplied by `avg_pooling_const` and accumulated into one of two
 * partial sums (even / odd tap position), which are added at the end.
 *
 * @return BM_SUCCESS, BM_ERR_NOMEM if the scratch plane cannot be allocated,
 *         or BM_ERR_FAILURE if padding fails
 */
int native_pooling_avg_fp32(const float *ifmap, float *ofmap, int input_n, int input_c, int input_h,
                            int input_w, int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l,
                            int pad_w_r, int stride_h, int stride_w, int ins_h, int ins_w,
                            int ins_h_last, int ins_w_last, float avg_pooling_const) {
  const int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b);
  const int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r);
  const int output_h = calc_output_hw(h_after, kh, stride_h);
  const int output_w = calc_output_hw(w_after, kw, stride_w);

  float *padded = (float *)malloc(sizeof(float) * h_after * w_after);
  if (padded == NULL) {
    printf("No enough memory[h_after, w_after]: [%u, %u].\n", h_after, w_after);
    return BM_ERR_NOMEM;
  }

  for (int n = 0; n < input_n; n++) {
    for (int c = 0; c < input_c; c++) {
      const int pad_status =
          fill_pad_fmap_fp32(ifmap, &padded, 0, pad_h_t, pad_h_b, pad_w_l, pad_w_r, ins_h, ins_w,
                             ins_h_last, ins_w_last, input_h, input_w);
      if (pad_status != BM_SUCCESS) {
        printf("Failed to pad input fmap.\n");
        free(padded);
        return BM_ERR_FAILURE;
      }

      for (int out_y = 0; out_y < output_h; out_y++) {
        const int top = out_y * stride_h;
        const int rows = math_min(kh, h_after - top);
        for (int out_x = 0; out_x < output_w; out_x++) {
          const int left = out_x * stride_w;
          const int cols = math_min(kw, w_after - left);
          const float *window = padded + top * w_after + left;

          /* partial[0] collects even tap positions, partial[1] odd ones. */
          float partial[2] = {0.0, 0.0};
          for (int i = 0; i < rows; i++) {
            for (int j = 0; j < cols; j++) {
              const int parity = ((i * cols + j) % 2) ? 1 : 0;
              partial[parity] += window[i * w_after + j] * avg_pooling_const;
            }
          }
          ofmap[out_y * output_w + out_x] = partial[0] + partial[1];
        }
      }

      ifmap += input_h * input_w;
      ofmap += output_h * output_w;
    }
  }

  free(padded);
  return BM_SUCCESS;
}
/**
 * Max pooling over a flat list of `count` output elements.
 *
 * Each output index is decomposed into (n, c, ph, pw); the window is clipped
 * to the input plane, so padding never takes part in the maximum.
 * top_data[index] receives the maximum and mask_data[index] its position
 * inside the input plane (h * width + w). If the clipped window is empty or
 * nothing exceeds -FLT_MAX, they are -FLT_MAX and -1.
 *
 * `num` is unused.
 */
void native_pooling_forward_max(const float *bottom_data, float *top_data, int *mask_data,
                                const int count, const int num, const int channels,
                                const int height, const int width, const int pooled_height,
                                const int pooled_width, const int kernel_h, const int kernel_w,
                                const int stride_h, const int stride_w, const int pad_h,
                                const int pad_w) {
  (void)num;
  for (int index = 0; index < count; ++index) {
    /* Peel the output coordinates off the flat index, innermost first. */
    int rest = index;
    const int pw = rest % pooled_width;
    rest /= pooled_width;
    const int ph = rest % pooled_height;
    rest /= pooled_height;
    const int c = rest % channels;
    const int n = rest / channels;

    const int h_origin = ph * stride_h - pad_h;
    const int w_origin = pw * stride_w - pad_w;
    const int hend = math_min(h_origin + kernel_h, height);
    const int wend = math_min(w_origin + kernel_w, width);
    const int hstart = math_max(h_origin, 0);
    const int wstart = math_max(w_origin, 0);

    const float *const plane = bottom_data + (n * channels + c) * height * width;
    float best = -FLT_MAX;
    int best_pos = -1;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        const int pos = h * width + w;
        if (plane[pos] > best) {
          best_pos = pos;
          best = plane[best_pos];
        }
      }
    }
    top_data[index] = best;
    mask_data[index] = best_pos;
  }
}
/**
 * Average pooling over a flat list of `count` output elements.
 *
 * Each output index is decomposed into (n, c, ph, pw). The divisor is the
 * window size measured against the padded plane (clipped at height + pad_h and
 * width + pad_w), while only elements inside the real input are summed.
 *
 * `num` is unused.
 */
void native_pooling_forward_ave(const float *bottom_data, float *top_data, const int count,
                                const int num, const int channels, const int height,
                                const int width, const int pooled_height, const int pooled_width,
                                const int kernel_h, const int kernel_w, const int stride_h,
                                const int stride_w, const int pad_h, const int pad_w) {
  (void)num;
  for (int index = 0; index < count; ++index) {
    /* Peel the output coordinates off the flat index, innermost first. */
    int rest = index;
    const int pw = rest % pooled_width;
    rest /= pooled_width;
    const int ph = rest % pooled_height;
    rest /= pooled_height;
    const int c = rest % channels;
    const int n = rest / channels;

    const int h_origin = ph * stride_h - pad_h;
    const int w_origin = pw * stride_w - pad_w;
    const int h_limit = math_min(h_origin + kernel_h, height + pad_h);
    const int w_limit = math_min(w_origin + kernel_w, width + pad_w);
    /* Divisor: window size before clipping to the unpadded input. */
    const int pool_size = (h_limit - h_origin) * (w_limit - w_origin);

    const int hstart = math_max(h_origin, 0);
    const int wstart = math_max(w_origin, 0);
    const int hend = math_min(h_limit, height);
    const int wend = math_min(w_limit, width);

    const float *const plane = bottom_data + (n * channels + c) * height * width;
    float sum = 0;
    for (int h = hstart; h < hend; ++h) {
      for (int w = wstart; w < wend; ++w) {
        sum += plane[h * width + w];
      }
    }
    top_data[index] = sum / pool_size;
  }
}
/**
 * Right-shift and saturate `len` 32-bit values to 8 bits.
 *
 * @param pBuff       input values
 * @param len         number of values
 * @param pByteOut    output bytes; unsigned results are stored as their raw
 *                    byte pattern
 * @param rshiftbits  right shift applied before saturation; must be >= 0
 * @param round_floor 1: round to nearest (shift by rshiftbits - 1, add 1,
 *                    shift by 1); anything else: plain shift
 * @param sign_unsign non-zero: clamp to [-128, 127]; zero: clamp to [0, 255]
 * @return BM_SUCCESS, or BM_ERR_INVALID_ARGUMENT if rshiftbits is negative
 */
int satu_2_8bit(const int *pBuff, int len, int8_t *pByteOut, int rshiftbits, int round_floor,
                int sign_unsign) {
  if (rshiftbits < 0) return BM_ERR_INVALID_ARGUMENT;

  const int satu_max = sign_unsign ? 127 : 255;
  const int satu_min = sign_unsign ? -128 : 0;

  for (int ii = 0; ii < len; ii++) {
    int temp = pBuff[ii];
    if (rshiftbits > 0) {
      if (round_floor == 1)
        temp = ((temp >> (rshiftbits - 1)) + 1) >> 1;
      else
        temp = temp >> rshiftbits;
    }
    temp = (temp > satu_max) ? satu_max : ((temp < satu_min) ? satu_min : temp);

    /*
     * Store the low 8 bits explicitly. Copying the first byte of the int, as
     * the original did, only yields the low byte on little-endian targets.
     */
    const uint8_t low_byte = (uint8_t)temp;
    memcpy(pByteOut + ii, &low_byte, 1);
  }
  return BM_SUCCESS;
}
/**
 * Right-shift and saturate `len` 32-bit values to 16 bits.
 *
 * @param pBuff       input values
 * @param len         number of values
 * @param pByteOut    output halfwords; unsigned results are stored as their
 *                    raw bit pattern
 * @param rshiftbits  right shift applied before saturation; must be >= 0
 * @param round_floor 1: round to nearest (shift by rshiftbits - 1, add 1,
 *                    shift by 1); anything else: plain shift
 * @param sign_unsign non-zero: clamp to [-32768, 32767]; zero: clamp to
 *                    [0, 65535]
 * @return BM_SUCCESS, or BM_ERR_INVALID_ARGUMENT if rshiftbits is negative
 */
int satu_2_16bit(const int *pBuff, int len, short *pByteOut, int rshiftbits, int round_floor,
                 int sign_unsign) {
  if (rshiftbits < 0) return BM_ERR_INVALID_ARGUMENT;

  const int satu_max = sign_unsign ? 32767 : 65535;
  const int satu_min = sign_unsign ? -32768 : 0;

  for (int ii = 0; ii < len; ii++) {
    int temp = pBuff[ii];
    if (rshiftbits > 0) {
      if (round_floor == 1)
        temp = ((temp >> (rshiftbits - 1)) + 1) >> 1;
      else
        temp = temp >> rshiftbits;
    }
    temp = (temp > satu_max) ? satu_max : ((temp < satu_min) ? satu_min : temp);

    /*
     * Store the low 16 bits explicitly. Copying the first two bytes of the
     * int, as the original did, only yields them on little-endian targets.
     */
    const uint16_t low_half = (uint16_t)temp;
    memcpy(pByteOut + ii, &low_half, 2);
  }
  return BM_SUCCESS;
}

View File

@ -0,0 +1,477 @@
/**
* Please refer to [git](https://github.com/xiezhq-hermann/atan_lookup).
* The input range is all real numbers and the output range is -pi/2 < x < pi/2;
* see [here](https://www.mathopenref.com/arctan.html) for more
* details.
*/
//
// xiezhq@shanghaitech.edu.cn && wanghe@shanghaitech.edu.cn
/* Reference:
[1] Abhisek Ukil, Vishal H Shah, Bernhard Deck,
"Fast Computation of arctangent Functions for Embedded Applications: A
Comparative Analysis" IEEE International Symposium on Industrial Electronics,
Pages: 1206 - 1211, DOI: 10.1109/ISIE.2011.5984330, 2011
[2] Sreeraman Rajan, Sichun Wang, Robert Inkol, and Alain Joyal
"Efficient Approximations for the Arctangent Function"
IEEE SIGNAL PROCESSING MAGAZINE [108] MAY 2006
*/
#include <cvimath_internal.h>
#include <test_cvikernel_util.h>
#define OUT
#define IN
#include <cfloat>
#include <iomanip>
#include <iostream>
#include <map>
#include <random>
#include <string>
//#define DBG
using namespace std;
// Disabled reference implementation (compiled out by `#if 0`). It relies on a
// lookup table LUT_d that is not defined in the visible part of this file.
#if 0
// Table-based arctangent with linear interpolation.
//  - |x| <= 1: look up x * 100 in LUT_d and interpolate towards the next entry.
//  - |x| >  1: apply the same lookup to 1 / |x| and subtract it from pi/2.
//  - negative x mirrors the positive result.
double atan_double(double x) {
/*
More precise look-up table is used for higher accuracy
*/
if (x >= 0) {
if (x <= 1) {
int index = round(x * 100);
return (LUT_d[index] + (x * 100 - index) * (LUT_d[index + 1] - LUT_d[index]));
} else {
double re_x = 1 / x;
int index = round(re_x * 100);
return (M_PI_2 - (LUT_d[index] + (re_x * 100 - index) * (LUT_d[index + 1] - LUT_d[index])));
// No recursive is better here
}
} else {
if (x >= -1) {
double abs_x = -x;
int index = round(abs_x * 100);
return -(LUT_d[index] + (abs_x * 100 - index) * (LUT_d[index + 1] - LUT_d[index]));
} else {
double re_x = 1 / (-x);
int index = round(re_x * 100);
return (LUT_d[index] + (re_x * 100 - index) * (LUT_d[index+1] - LUT_d[index])) - M_PI_2;
}
}
}
#endif
/**
* Test modes. "pre_data" means a fixed input pattern is tested; it should be
* the same as the lookup table.
*/
enum TEST_MODE {
PRE_DATA_COMPARE_FIX = 0, // fixed pre-data pattern + exact (fixed) compare
DATA_COMPARE_ACCURACY, // generate values from \range_start to \range_end
// and check them against an epsilon
DATA_COMPARE_U8, // generate values from \range_start to \range_end and check
// them against an epsilon, result converted bf16->uint8_t
TEST_MODE_MAX, // number of modes; not a mode itself
};
// Currently selected test mode for this translation unit.
static TEST_MODE mode;
static uint16_t test_pattern[] = {
0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52, 0x3A6C, 0x3A83, 0x3A90,
0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF, 0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17,
0x3B1D, 0x3B24, 0x3B2A, 0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65,
0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90, 0x3B93, 0x3B97, 0x3B9A,
0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE, 0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1,
0x3BC5, 0x3BC8, 0x3BCB, 0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9,
0x3BEC, 0x3BEF, 0x3BF2, 0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03, 0x3C05, 0x3C06, 0x3C08,
0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12, 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C,
0x3C1D, 0x3C1F, 0x3C21, 0x3C22, 0x3C24, 0x3C25, 0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F,
0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E, 0x3C40, 0x3C41, 0x3C43,
0x3C45, 0x3C46, 0x3C48, 0x3C4A, 0x3C4B, 0x3C4D, 0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57,
0x3C58, 0x3C5A, 0x3C5C, 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A,
0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, 0x3C7B, 0x3C7C, 0x3C7E,
0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89,
0x3C8A, 0x3C8A, 0x3C8B, 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93,
0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, 0x3C9B, 0x3C9C, 0x3C9C,
0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6,
0x3CA7, 0x3CA8, 0x3CA9, 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0,
0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, 0x3CB8, 0x3CB9, 0x3CBA,
0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4,
0x3CC5, 0x3CC5, 0x3CC6, 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE,
0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, 0x3CD6, 0x3CD7, 0x3CD7,
0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1,
0x3CE2, 0x3CE3, 0x3CE4, 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB,
0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, 0x3CF3, 0x3CF4, 0x3CF5,
0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF,
0x3D00, 0x3D00, 0x3D01, 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 0x3D03, 0x3D03, 0x3D04, 0x3D04,
0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, 0x3D08, 0x3D09, 0x3D09,
0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 0x3D0E,
0x3D0F, 0x3D0F, 0x3D0F, 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13,
0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 0x3D16, 0x3D17, 0x3D17, 0x3D18, 0x3D18,
0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D,
0x3D1D, 0x3D1E, 0x3D1E, 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22,
0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, 0x3D26, 0x3D26, 0x3D27,
0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C,
0x3D2C, 0x3D2C, 0x3D2D, 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31,
0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 0x3D33, 0x3D33, 0x3D34, 0x3D34, 0x3D35, 0x3D35, 0x3D35,
0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A,
0x3D3B, 0x3D3B, 0x3D3C, 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F,
0x3D40, 0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, 0x3D43, 0x3D44, 0x3D44,
0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49,
0x3D4A, 0x3D4A, 0x3D4A, 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E,
0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, 0x3D52, 0x3D53, 0x3D53,
0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58,
0x3D58, 0x3D59, 0x3D59, 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D,
0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, 0x3D61, 0x3D61, 0x3D62,
0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67,
0x3D67, 0x3D67, 0x3D68, 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C,
0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, 0x3D70, 0x3D70, 0x3D70,
0x3D71, 0x3D71, 0x3D72, 0x3D72, 0x3D72, 0x3D73, 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75,
0x3D76, 0x3D76, 0x3D77, 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A,
0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, 0x3D7E, 0x3D7F, 0x3D7F,
0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82,
0x3D82, 0x3D82, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85,
0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D87, 0x3D87, 0x3D87,
0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89,
0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8C, 0x3D8C,
0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E,
0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91,
0x3D91, 0x3D91, 0x3D91, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93,
0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, 0x3D95, 0x3D96, 0x3D96,
0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98,
0x3D98, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B,
0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9D, 0x3D9D, 0x3D9D,
0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0,
0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2,
0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4,
0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7,
0x3DA7, 0x3DA7, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9,
0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAC, 0x3DAC,
0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE,
0x3DAE, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1,
0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3,
0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6,
0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8,
0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBB,
0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD,
0x3DBD, 0x3DBD, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF,
0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC2, 0x3DC2, 0x3DC2,
0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4,
0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7,
0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, 0x3DC9, 0x3DC9, 0x3DC9,
0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC,
0x3DCC, 0x3DCC, 0x3DCC, 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5,
0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, 0x3DDF, 0x3DE0, 0x3DE1,
0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5,
};
static uint16_t golden_bf16[] = {
0x0, 0x38d2, 0x3952, 0x399d, 0x39d2, 0x3a03, 0x3a1d, 0x3a38, 0x3a52, 0x3a6c, 0x3a83, 0x3a90,
0x3a9d, 0x3aaa, 0x3ab8, 0x3ac5, 0x3ad2, 0x3adf, 0x3aec, 0x3afa, 0x3b03, 0x3b0a, 0x3b10, 0x3b17,
0x3b1d, 0x3b24, 0x3b2a, 0x3b31, 0x3b38, 0x3b3e, 0x3b45, 0x3b4c, 0x3b52, 0x3b59, 0x3b5f, 0x3b65,
0x3b6c, 0x3b72, 0x3b7a, 0x3b80, 0x3b83, 0x3b86, 0x3b8a, 0x3b8d, 0x3b90, 0x3b93, 0x3b97, 0x3b9a,
0x3b9d, 0x3ba1, 0x3ba4, 0x3ba7, 0x3baa, 0x3bae, 0x3bb1, 0x3bb4, 0x3bb8, 0x3bbb, 0x3bbe, 0x3bc1,
0x3bc5, 0x3bc8, 0x3bcb, 0x3bce, 0x3bd2, 0x3bd6, 0x3bd8, 0x3bdc, 0x3bdf, 0x3be2, 0x3be6, 0x3be9,
0x3bec, 0x3bef, 0x3bf2, 0x3bf6, 0x3bf9, 0x3bfc, 0x3c00, 0x3c01, 0x3c03, 0x3c05, 0x3c06, 0x3c08,
0x3c0a, 0x3c0b, 0x3c0d, 0x3c0f, 0x3c10, 0x3c12, 0x3c13, 0x3c15, 0x3c17, 0x3c18, 0x3c1a, 0x3c1c,
0x3c1d, 0x3c1f, 0x3c21, 0x3c22, 0x3c24, 0x3c25, 0x3c27, 0x3c29, 0x3c2a, 0x3c2c, 0x3c2e, 0x3c2f,
0x3c31, 0x3c33, 0x3c34, 0x3c36, 0x3c38, 0x3c39, 0x3c3b, 0x3c3c, 0x3c3e, 0x3c40, 0x3c41, 0x3c43,
0x3c45, 0x3c46, 0x3c48, 0x3c4a, 0x3c4b, 0x3c4d, 0x3c4e, 0x3c50, 0x3c52, 0x3c53, 0x3c55, 0x3c57,
0x3c58, 0x3c5a, 0x3c5c, 0x3c5d, 0x3c5f, 0x3c60, 0x3c62, 0x3c64, 0x3c66, 0x3c68, 0x3c69, 0x3c6a,
0x3c6c, 0x3c6e, 0x3c70, 0x3c71, 0x3c72, 0x3c74, 0x3c76, 0x3c78, 0x3c79, 0x3c7b, 0x3c7c, 0x3c7e,
0x3c80, 0x3c81, 0x3c81, 0x3c82, 0x3c83, 0x3c84, 0x3c85, 0x3c86, 0x3c86, 0x3c87, 0x3c88, 0x3c89,
0x3c8a, 0x3c8a, 0x3c8b, 0x3c8c, 0x3c8d, 0x3c8e, 0x3c8f, 0x3c8f, 0x3c90, 0x3c91, 0x3c92, 0x3c93,
0x3c93, 0x3c94, 0x3c95, 0x3c96, 0x3c97, 0x3c98, 0x3c98, 0x3c99, 0x3c9a, 0x3c9b, 0x3c9c, 0x3c9c,
0x3c9d, 0x3c9e, 0x3c9f, 0x3ca0, 0x3ca1, 0x3ca1, 0x3ca2, 0x3ca3, 0x3ca4, 0x3ca5, 0x3ca5, 0x3ca6,
0x3ca7, 0x3ca8, 0x3ca9, 0x3caa, 0x3caa, 0x3cab, 0x3cac, 0x3cad, 0x3cae, 0x3cae, 0x3caf, 0x3cb0,
0x3cb1, 0x3cb2, 0x3cb3, 0x3cb3, 0x3cb4, 0x3cb5, 0x3cb6, 0x3cb7, 0x3cb8, 0x3cb8, 0x3cb9, 0x3cba,
0x3cbb, 0x3cbc, 0x3cbc, 0x3cbd, 0x3cbe, 0x3cbf, 0x3cc0, 0x3cc1, 0x3cc1, 0x3cc2, 0x3cc3, 0x3cc4,
0x3cc5, 0x3cc5, 0x3cc6, 0x3cc7, 0x3cc8, 0x3cc9, 0x3cca, 0x3cca, 0x3ccb, 0x3ccc, 0x3ccd, 0x3cce,
0x3cce, 0x3ccf, 0x3cd0, 0x3cd1, 0x3cd2, 0x3cd3, 0x3cd3, 0x3cd4, 0x3cd5, 0x3cd6, 0x3cd7, 0x3cd7,
0x3cd8, 0x3cd9, 0x3cda, 0x3cdb, 0x3cdc, 0x3cdc, 0x3cdd, 0x3cde, 0x3cdf, 0x3ce0, 0x3ce0, 0x3ce1,
0x3ce2, 0x3ce3, 0x3ce4, 0x3ce5, 0x3ce5, 0x3ce6, 0x3ce7, 0x3ce8, 0x3ce9, 0x3ce9, 0x3cea, 0x3ceb,
0x3cec, 0x3ced, 0x3cee, 0x3cee, 0x3cef, 0x3cf0, 0x3cf1, 0x3cf2, 0x3cf2, 0x3cf3, 0x3cf4, 0x3cf5,
0x3cf6, 0x3cf7, 0x3cf7, 0x3cf8, 0x3cf9, 0x3cfa, 0x3cfb, 0x3cfb, 0x3cfc, 0x3cfd, 0x3cfe, 0x3cff,
0x3d00, 0x3d00, 0x3d01, 0x3d01, 0x3d01, 0x3d02, 0x3d02, 0x3d03, 0x3d03, 0x3d03, 0x3d04, 0x3d04,
0x3d05, 0x3d05, 0x3d06, 0x3d06, 0x3d06, 0x3d07, 0x3d07, 0x3d08, 0x3d08, 0x3d08, 0x3d09, 0x3d09,
0x3d0a, 0x3d0a, 0x3d0a, 0x3d0b, 0x3d0b, 0x3d0c, 0x3d0c, 0x3d0c, 0x3d0d, 0x3d0d, 0x3d0e, 0x3d0e,
0x3d0f, 0x3d0f, 0x3d0f, 0x3d10, 0x3d10, 0x3d11, 0x3d11, 0x3d11, 0x3d12, 0x3d12, 0x3d13, 0x3d13,
0x3d13, 0x3d14, 0x3d14, 0x3d15, 0x3d15, 0x3d16, 0x3d16, 0x3d16, 0x3d17, 0x3d17, 0x3d18, 0x3d18,
0x3d18, 0x3d19, 0x3d19, 0x3d1a, 0x3d1a, 0x3d1a, 0x3d1b, 0x3d1b, 0x3d1c, 0x3d1c, 0x3d1c, 0x3d1d,
0x3d1d, 0x3d1e, 0x3d1e, 0x3d1f, 0x3d1f, 0x3d1f, 0x3d20, 0x3d20, 0x3d21, 0x3d21, 0x3d21, 0x3d22,
0x3d22, 0x3d23, 0x3d23, 0x3d23, 0x3d24, 0x3d24, 0x3d25, 0x3d25, 0x3d25, 0x3d26, 0x3d26, 0x3d27,
0x3d27, 0x3d28, 0x3d28, 0x3d28, 0x3d29, 0x3d29, 0x3d2a, 0x3d2a, 0x3d2a, 0x3d2b, 0x3d2b, 0x3d2c,
0x3d2c, 0x3d2c, 0x3d2d, 0x3d2d, 0x3d2e, 0x3d2e, 0x3d2e, 0x3d2f, 0x3d2f, 0x3d30, 0x3d30, 0x3d31,
0x3d31, 0x3d31, 0x3d32, 0x3d32, 0x3d33, 0x3d33, 0x3d33, 0x3d34, 0x3d34, 0x3d35, 0x3d35, 0x3d35,
0x3d36, 0x3d36, 0x3d37, 0x3d37, 0x3d38, 0x3d38, 0x3d38, 0x3d39, 0x3d39, 0x3d3a, 0x3d3a, 0x3d3a,
0x3d3b, 0x3d3b, 0x3d3c, 0x3d3c, 0x3d3c, 0x3d3d, 0x3d3d, 0x3d3e, 0x3d3e, 0x3d3e, 0x3d3f, 0x3d3f,
0x3d40, 0x3d40, 0x3d41, 0x3d41, 0x3d41, 0x3d42, 0x3d42, 0x3d43, 0x3d43, 0x3d43, 0x3d44, 0x3d44,
0x3d45, 0x3d45, 0x3d45, 0x3d46, 0x3d46, 0x3d47, 0x3d47, 0x3d47, 0x3d48, 0x3d48, 0x3d49, 0x3d49,
0x3d4a, 0x3d4a, 0x3d4a, 0x3d4b, 0x3d4b, 0x3d4c, 0x3d4c, 0x3d4c, 0x3d4d, 0x3d4d, 0x3d4e, 0x3d4e,
0x3d4e, 0x3d4f, 0x3d4f, 0x3d50, 0x3d50, 0x3d50, 0x3d51, 0x3d51, 0x3d52, 0x3d52, 0x3d53, 0x3d53,
0x3d53, 0x3d54, 0x3d54, 0x3d55, 0x3d55, 0x3d55, 0x3d56, 0x3d56, 0x3d57, 0x3d57, 0x3d57, 0x3d58,
0x3d58, 0x3d59, 0x3d59, 0x3d59, 0x3d5a, 0x3d5a, 0x3d5b, 0x3d5b, 0x3d5c, 0x3d5c, 0x3d5c, 0x3d5d,
0x3d5d, 0x3d5e, 0x3d5e, 0x3d5e, 0x3d5f, 0x3d5f, 0x3d60, 0x3d60, 0x3d60, 0x3d60, 0x3d60, 0x3d61,
0x3d61, 0x3d62, 0x3d62, 0x3d62, 0x3d63, 0x3d63, 0x3d64, 0x3d64, 0x3d64, 0x3d65, 0x3d65, 0x3d66,
0x3d66, 0x3d66, 0x3d67, 0x3d67, 0x3d68, 0x3d68, 0x3d68, 0x3d69, 0x3d69, 0x3d6a, 0x3d6a, 0x3d6b,
0x3d6b, 0x3d6b, 0x3d6c, 0x3d6c, 0x3d6d, 0x3d6d, 0x3d6d, 0x3d6e, 0x3d6e, 0x3d6f, 0x3d6f, 0x3d6f,
0x3d70, 0x3d70, 0x3d71, 0x3d71, 0x3d71, 0x3d72, 0x3d72, 0x3d73, 0x3d73, 0x3d74, 0x3d74, 0x3d74,
0x3d75, 0x3d75, 0x3d76, 0x3d76, 0x3d76, 0x3d77, 0x3d77, 0x3d78, 0x3d78, 0x3d78, 0x3d79, 0x3d79,
0x3d7a, 0x3d7a, 0x3d7a, 0x3d7b, 0x3d7b, 0x3d7c, 0x3d7c, 0x3d7d, 0x3d7d, 0x3d7d, 0x3d7e, 0x3d7e,
0x3d7f, 0x3d7f, 0x3d7f, 0x3d7f, 0x3d81, 0x3d81, 0x3d81, 0x3d81, 0x3d81, 0x3d82, 0x3d82, 0x3d82,
0x3d82, 0x3d82, 0x3d83, 0x3d83, 0x3d83, 0x3d83, 0x3d83, 0x3d84, 0x3d84, 0x3d84, 0x3d84, 0x3d85,
0x3d85, 0x3d85, 0x3d85, 0x3d85, 0x3d86, 0x3d86, 0x3d86, 0x3d86, 0x3d86, 0x3d87, 0x3d87, 0x3d87,
0x3d87, 0x3d87, 0x3d88, 0x3d88, 0x3d88, 0x3d88, 0x3d88, 0x3d89, 0x3d89, 0x3d89, 0x3d89, 0x3d89,
0x3d8a, 0x3d8a, 0x3d8a, 0x3d8a, 0x3d8a, 0x3d8b, 0x3d8b, 0x3d8b, 0x3d8b, 0x3d8b, 0x3d8c, 0x3d8c,
0x3d8c, 0x3d8c, 0x3d8c, 0x3d8d, 0x3d8d, 0x3d8d, 0x3d8d, 0x3d8e, 0x3d8e, 0x3d8e, 0x3d8e, 0x3d8e,
0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d90, 0x3d90, 0x3d90, 0x3d90, 0x3d90, 0x3d91, 0x3d91,
0x3d91, 0x3d91, 0x3d91, 0x3d92, 0x3d92, 0x3d92, 0x3d92, 0x3d92, 0x3d93, 0x3d93, 0x3d93, 0x3d93,
0x3d93, 0x3d94, 0x3d94, 0x3d94, 0x3d94, 0x3d94, 0x3d95, 0x3d95, 0x3d95, 0x3d95, 0x3d96, 0x3d96,
0x3d96, 0x3d96, 0x3d96, 0x3d97, 0x3d97, 0x3d97, 0x3d97, 0x3d97, 0x3d98, 0x3d98, 0x3d98, 0x3d98,
0x3d98, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d9a,
0x3d9a, 0x3d9a, 0x3d9a, 0x3d9a, 0x3d9b, 0x3d9b, 0x3d9b, 0x3d9b, 0x3d9b, 0x3d9c, 0x3d9c, 0x3d9c,
0x3d9c, 0x3d9c, 0x3d9d, 0x3d9d, 0x3d9d, 0x3d9d, 0x3d9e, 0x3d9e, 0x3d9e, 0x3d9e, 0x3d9e, 0x3d9f,
0x3d9f, 0x3d9f, 0x3d9f, 0x3d9f, 0x3da0, 0x3da0, 0x3da0, 0x3da0, 0x3da0, 0x3da1, 0x3da1, 0x3da1,
0x3da1, 0x3da1, 0x3da2, 0x3da2, 0x3da2, 0x3da2, 0x3da2, 0x3da3, 0x3da3, 0x3da3, 0x3da3, 0x3da3,
0x3da4, 0x3da4, 0x3da4, 0x3da4, 0x3da4, 0x3da5, 0x3da5, 0x3da5, 0x3da5, 0x3da6, 0x3da6, 0x3da6,
0x3da6, 0x3da6, 0x3da7, 0x3da7, 0x3da7, 0x3da7, 0x3da7, 0x3da8, 0x3da8, 0x3da8, 0x3da8, 0x3da8,
0x3da9, 0x3da9, 0x3da9, 0x3da9, 0x3da9, 0x3daa, 0x3daa, 0x3daa, 0x3daa, 0x3daa, 0x3dab, 0x3dab,
0x3dab, 0x3dab, 0x3dab, 0x3dac, 0x3dac, 0x3dac, 0x3dac, 0x3dac, 0x3dad, 0x3dad, 0x3dad, 0x3dad,
0x3dad, 0x3daf, 0x3daf, 0x3daf, 0x3daf, 0x3db0, 0x3db0, 0x3db0, 0x3db0, 0x3db0, 0x3db1, 0x3db1,
0x3db1, 0x3db1, 0x3db1, 0x3db2, 0x3db2, 0x3db2, 0x3db2, 0x3db2, 0x3db3, 0x3db3, 0x3db3, 0x3db3,
0x3db3, 0x3db4, 0x3db4, 0x3db4, 0x3db4, 0x3db4, 0x3db5, 0x3db5, 0x3db5, 0x3db5, 0x3db5, 0x3db6,
0x3db6, 0x3db6, 0x3db6, 0x3db6, 0x3db7, 0x3db7, 0x3db7, 0x3db7, 0x3db8, 0x3db8, 0x3db8, 0x3db8,
0x3db8, 0x3db9, 0x3db9, 0x3db9, 0x3db9, 0x3db9, 0x3dba, 0x3dba, 0x3dba, 0x3dba, 0x3dba, 0x3dbb,
0x3dbb, 0x3dbb, 0x3dbb, 0x3dbb, 0x3dbc, 0x3dbc, 0x3dbc, 0x3dbc, 0x3dbc, 0x3dbd, 0x3dbd, 0x3dbd,
0x3dbd, 0x3dbd, 0x3dbe, 0x3dbe, 0x3dbe, 0x3dbe, 0x3dbe, 0x3dbf, 0x3dbf, 0x3dbf, 0x3dbf, 0x3dbf,
0x3dc0, 0x3dc0, 0x3dc0, 0x3dc0, 0x3dc1, 0x3dc1, 0x3dc1, 0x3dc1, 0x3dc1, 0x3dc1, 0x3dc1, 0x3dc1,
0x3dc1, 0x3dc1, 0x3dc2, 0x3dc2, 0x3dc2, 0x3dc2, 0x3dc2, 0x3dc3, 0x3dc3, 0x3dc3, 0x3dc3, 0x3dc3,
0x3dc4, 0x3dc4, 0x3dc4, 0x3dc4, 0x3dc4, 0x3dc5, 0x3dc5, 0x3dc5, 0x3dc5, 0x3dc5, 0x3dc6, 0x3dc6,
0x3dc6, 0x3dc6, 0x3dc6, 0x3dc7, 0x3dc7, 0x3dc7, 0x3dc7, 0x3dc7, 0x3dc8, 0x3dc8, 0x3dc8, 0x3dc8,
0x3dc9, 0x3dc9, 0x3dc9, 0x3dc9, 0x3dc9, 0x3dca, 0x3dca, 0x3dca, 0x3dca, 0x3dca, 0x3dcb, 0x3dcb,
0x3dcb, 0x3dcb, 0x3dcb, 0x3dcc, 0x3dcd, 0x3dce, 0x3dcf, 0x3dd0, 0x3dd1, 0x3dd2, 0x3dd3, 0x3dd4,
0x3dd5, 0x3dd6, 0x3dd7, 0x3dd8, 0x3dd9, 0x3dda, 0x3ddb, 0x3ddc, 0x3ddd, 0x3dde, 0x3ddf, 0x3de0,
0x3de1, 0x3de2, 0x3de3, 0x3de4,
};
// Host-side reference: f(x) = atan(x), returned as a double.
static double _gen_atan(float i) {
  const double angle = atan(i);
  return angle;
}
/**
 * Build the host-side expected output for every element of \p ifmap.
 *
 * Each bf16 input is widened to fp32, run through _gen_atan() and narrowed
 * back to bf16. The global test mode then optionally overrides that value:
 *  - PRE_DATA_COMPARE_FIX: the entry of golden_bf16 at the same index is used;
 *  - DATA_COMPARE_U8: the bf16 value is converted with convert_bf16_s8() and
 *    cast to uint8_t.
 *
 * \param ofmap       destination buffer, one uint16_t per element
 * \param ifmap       bf16 input values
 * \param ifmap_shape shape whose tl_shape_size() gives the element count
 */
static void tl_lut_ref(uint16_t *ofmap, uint16_t *ifmap, cvk_tl_shape_t ifmap_shape) {
  assert(ofmap);

  uint32_t idx = 0;
  while (idx < tl_shape_size(&ifmap_shape)) {
    const float x = convert_bf16_fp32(ifmap[idx]);
    const double angle = _gen_atan(x);
    uint16_t expected = convert_fp32_bf16(angle);

    switch (mode) {
      case PRE_DATA_COMPARE_FIX:
        expected = golden_bf16[idx];
        break;
      case DATA_COMPARE_U8:
        expected = (uint8_t)convert_bf16_s8(expected);
        break;
      default:
        break;
    }

    ofmap[idx] = expected;
    ++idx;
  }
}
/**
 * Compare the device output against the host reference, element by element.
 *
 * In PRE_DATA_COMPARE_FIX mode the raw 16-bit values must be identical; in
 * every other mode the fp32 values must differ by less than \p epsilon.
 * On the first mismatch a diagnostic is written to stderr and the process
 * exits with status -1, so the function only ever returns true.
 *
 * \param ofmap_data  values read back from the device
 * \param ref_data    expected values
 * \param ifmap       bf16 inputs, used only for the failure message
 * \param ifmap_size  number of elements to compare
 * \param epsilon     absolute tolerance for the non-fixed modes
 * \return true when every element passes
 */
static bool verify(uint16_t *ofmap_data, uint16_t *ref_data, uint16_t *ifmap, uint64_t ifmap_size,
                   float epsilon) {
  for (uint64_t i = 0; i < ifmap_size; i++) {
    const uint16_t ref = ref_data[i];
    const uint16_t got = ofmap_data[i];
    const float ref_f = convert_bf16_fp32(ref);
    const float got_f = convert_bf16_fp32(got);

    bool is_close;
    if (mode == PRE_DATA_COMPARE_FIX) {
      is_close = got == ref;
    } else {
      is_close = fabs(ref_f - got_f) < epsilon;
    }
    if (is_close) {
      continue;
    }

    const float input = convert_bf16_fp32(ifmap[i]);
    // The index is 64-bit: print it as unsigned long long so the format and
    // the argument agree even where unsigned long is only 32 bits wide.
    fprintf(stderr,
            "comparing failed at ofmap_data[%llu](input:%f)\n"
            "\tgot %x, exp %x, fp32: got %f exp %f, atan(%f) = %f\n",
            (unsigned long long)i, input, got, ref, got_f, ref_f, input, _gen_atan(input));
    exit(-1);
  }
  return true;
}
/**
 * Fill \p input_data with \p ifmap_size bf16 test inputs.
 *
 * PRE_DATA_COMPARE_FIX: copies the fixed test_pattern table. At most
 * \p ifmap_size elements are copied, so a tensor smaller than the table no
 * longer overruns the buffer; elements beyond the table are set to 0.
 *
 * Other modes: element i gets the value
 *   (i % (range_end - 2)) * (+1 for odd i, -1 for even i) + 0.03 + (i % 256) * 0.002
 * and the first three elements are then forced to 0, 1 and -1.
 *
 * \param input_data  destination buffer with room for \p ifmap_size elements
 * \param ifmap_size  number of elements to generate
 * \param mode        test mode selecting the data source (shadows the global)
 * \param range_start kept for interface compatibility; not used
 * \param range_end   upper bound used to derive the integer part of the inputs
 */
static void gen_input(uint16_t *input_data, uint64_t ifmap_size, TEST_MODE mode, int range_start,
                      int range_end) {
  // range_start previously only parameterized a random distribution that was
  // never sampled.
  (void)range_start;

  if (mode == PRE_DATA_COMPARE_FIX) {
    const uint64_t pattern_count = sizeof(test_pattern) / sizeof(test_pattern[0]);
    const uint64_t copy_count = ifmap_size < pattern_count ? ifmap_size : pattern_count;
    memcpy(input_data, test_pattern, copy_count * sizeof(test_pattern[0]));
    // Do not leave uninitialized data behind when the tensor is larger than
    // the fixed pattern.
    for (uint64_t i = copy_count; i < ifmap_size; i++) {
      input_data[i] = 0;
    }
  } else {
    const int table_hw = 256;
    // Guard the modulo below against range_end == 2.
    const int span = range_end - 2;
    for (uint64_t i = 0; i < ifmap_size; i++) {
      const int magnitude = (span != 0) ? ((int)i % span) : 0;
      const int sign = ((int)i % 2) ? 1 : -1;
      float input = magnitude * sign + 0.03 + (i % table_hw) * 0.002;
      input_data[i] = convert_fp32_bf16(input);
    }
    // Always exercise the special points 0, 1 and -1 when there is room.
    if (ifmap_size > 0) {
      input_data[0] = convert_fp32_bf16(0);
    }
    if (ifmap_size > 1) {
      input_data[1] = convert_fp32_bf16(1);
    }
    if (ifmap_size > 2) {
      input_data[2] = convert_fp32_bf16(-1);
    }
  }
#ifdef DBG
  for (uint64_t i = 0; i < ifmap_size; i++) {
    printf("source if[%llu] bf16 %f 0x%x, log2f is %f\n", (unsigned long long)i,
           convert_bf16_fp32(input_data[i]), input_data[i],
           floor(log2((convert_bf16_fp32(input_data[i])))));
  }
#endif /* ifdef DBG */
}
/**
* Run one atan test pass for the current global `mode`.
*
* Steps: build the input and the host reference, generate the reciprocal and
* atan lookup tables, upload everything, emit the atan computation with
* cvm_atan_emit(), submit it, read the result back and check it with verify().
* verify() exits the process on a mismatch, so returning from this function
* means the pass succeeded.
*/
static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) {
// TODO: check more shape / align
cvk_chip_info_t chip_info = bmk->info;
// Input tensor is 1 x npu_num x 16 x 16 by default.
uint32_t input_n = 1;
uint32_t input_c = chip_info.npu_num;
uint32_t input_h = 16;
uint32_t input_w = 16;
// Absolute tolerance used by verify() in the non-fixed modes.
float epsilon = 0.01;
// Forwarded to gen_input().
int range_start = -8;
int range_end = 8;
cvk_fmt_t fmt = CVK_FMT_BF16;
// The fixed-pattern mode uses a 4 x 8 plane per channel. With npu_num == 32
// this gives 1024 elements, the size of test_pattern / golden_bf16.
// NOTE(review): for any other npu_num the element count differs from the
// 1024-entry tables — confirm this mode is valid on such chips.
if (mode == PRE_DATA_COMPARE_FIX) {
input_h = 4;
input_w = 8;
}
cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w};
cvk_tl_shape_t ofmap_shape = ifmap_shape;
uint64_t ifmap_size = tl_shape_size(&ifmap_shape);
uint64_t ofmap_size = tl_shape_size(&ofmap_shape);
int data_type_size = bytesize_of_fmt(fmt);
uint64_t ifmap_bytesize = ifmap_size * data_type_size;
uint64_t ofmap_bytesize = ofmap_size * data_type_size;
// get lut table shape and size
cvk_tl_shape_t table_shape;
uint64_t table_bytesize = cvm_lut_tbl_bytesize(bmk, &table_shape, fmt);
// Tensor allocations. They are released at the end of the function in exactly
// the reverse order — presumably required by the allocator; keep both
// sequences in sync when adding or removing a tensor.
cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1);
cvk_tl_t *out = tl_ofmap_bf16;
// atan buf
cvk_tl_t *tl_y0_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_slope_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_invert_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_pos_neg_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
// reciprocal buf
cvk_tl_t *tl_reciprocal_table_answer = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_reciprocal_table_answer_mantissa = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
// temp buf
cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_buf2 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1);
cvk_tl_t *tl_buf4 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1);
// Host-side buffers: input values and expected output.
uint16_t *input_data = (uint16_t *)xmalloc(ifmap_bytesize);
uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize);
// for reciprocal
uint16_t *table_reciprocal_data = (uint16_t *)xmalloc(table_bytesize);
uint16_t *table_reciprocal_data_mantissa = (uint16_t *)xmalloc(table_bytesize);
// for atan
uint16_t *table_data_atan_y0 = (uint16_t *)xmalloc(table_bytesize);
uint16_t *table_data_atan_slope = (uint16_t *)xmalloc(table_bytesize);
uint16_t *table_data_atan_invert = (uint16_t *)xmalloc(table_bytesize);
uint16_t *table_data_atan_pos_neg = (uint16_t *)xmalloc(table_bytesize);
// Generate the inputs, then the expected outputs derived from them.
gen_input(input_data, ifmap_size, mode, range_start, range_end);
tl_lut_ref(ref_data, input_data, ifmap_shape);
// Fill the host-side lookup tables.
cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape);
cvm_atan_tbl(table_data_atan_y0, table_data_atan_slope, table_data_atan_invert,
table_data_atan_pos_neg, &table_shape);
// Upload the input and the reciprocal tables into their tensors.
test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)input_data);
test_put_tensor_g2l_comp(ctx, bmk, tl_reciprocal_table_answer, (uint8_t *)table_reciprocal_data);
test_put_tensor_g2l_comp(ctx, bmk, tl_reciprocal_table_answer_mantissa,
(uint8_t *)table_reciprocal_data_mantissa);
// prepare atan
test_put_tensor_g2l_comp(ctx, bmk, tl_y0_buf, (uint8_t *)table_data_atan_y0);
test_put_tensor_g2l_comp(ctx, bmk, tl_slope_buf, (uint8_t *)table_data_atan_slope);
test_put_tensor_g2l_comp(ctx, bmk, tl_invert_buf, (uint8_t *)table_data_atan_invert);
test_put_tensor_g2l_comp(ctx, bmk, tl_pos_neg_buf, (uint8_t *)table_data_atan_pos_neg);
// Emit the atan computation (input tl_ifmap, result tl_ofmap_bf16) and submit it.
cvm_atan_emit(bmk, tl_ifmap, tl_buf, tl_buf2, tl_buf4, tl_y0_buf, tl_slope_buf, tl_invert_buf,
tl_pos_neg_buf, tl_reciprocal_table_answer, tl_reciprocal_table_answer_mantissa,
tl_ofmap_bf16, fmt);
test_submit_comp(ctx, bmk);
// Read the result back; the returned buffer is released with free() below.
uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, out);
// The return value is ignored: verify() calls exit(-1) on the first mismatch.
verify(ofmap_data, ref_data, input_data, ifmap_size, epsilon);
// Release the tensors in reverse order of allocation.
free_tl(bmk, tl_buf4);
free_tl(bmk, tl_buf2);
free_tl(bmk, tl_buf);
free_tl(bmk, tl_reciprocal_table_answer_mantissa);
free_tl(bmk, tl_reciprocal_table_answer);
free_tl(bmk, tl_pos_neg_buf);
free_tl(bmk, tl_invert_buf);
free_tl(bmk, tl_slope_buf);
free_tl(bmk, tl_y0_buf);
free_tl(bmk, tl_ofmap_bf16);
free_tl(bmk, tl_ifmap);
// Release the host-side buffers.
free(table_data_atan_y0);
free(table_data_atan_slope);
free(table_data_atan_invert);
free(table_data_atan_pos_neg);
free(table_reciprocal_data);
free(table_reciprocal_data_mantissa);
free(input_data);
free(ref_data);
free(ofmap_data);
}
/**
 * Test entry point: runs testbench() once per enabled test mode.
 *
 * The modes from PRE_DATA_COMPARE_FIX up to, but not including,
 * DATA_COMPARE_U8 are executed. testbench() terminates the process on a
 * mismatch, so reaching the "pass" message means every mode succeeded.
 */
int main() {
  CVI_RT_HANDLE ctx;
  cvk_context_t *bmk = NULL;

  // Keep the value returned here so it can be handed back to
  // restore_feround() once the tests are done.
  const int saved_round_mode = set_store_feround();
  test_init(&ctx, &bmk);

  int next_mode = PRE_DATA_COMPARE_FIX;
  while (next_mode < DATA_COMPARE_U8) {
    mode = static_cast<TEST_MODE>(next_mode);
    printf("test mode %d...\n", mode);
    testbench(&ctx, bmk);
    ++next_mode;
  }

  printf("pass\n");
  test_exit(&ctx, bmk);
  restore_feround(saved_round_mode);
  return 0;
}

View File

@ -0,0 +1,667 @@
/**
* \brief atan2 is implemented in terms of atan; refer to the
* [wiki](https://en.wikipedia.org/wiki/Atan2) for more details
*/
#include <cvimath_internal.h>
#include <test_cvikernel_util.h>
#define OUT
#define IN
#include <cfloat>
#include <iomanip>
#include <iostream>
#include <map>
#include <random>
#include <string>
//#define DBG
/**
* Test modes of the atan2 test.
* "pre_data" means a fixed input pattern is tested; per the original note it
* should be the same as the lut.
*/
enum TEST_MODE {
PRE_DATA_COMPARE_FIX = 0, // fixed input pattern, fixed-value comparison
DATA_COMPARE_ACCURACY, // values generated from \range_start to \range_end,
// checked against epsilon; default case x > 0, y > 0
DATA_COMPARE_ACCURACY_X_GT_0, // atan(y/x), x > 0, y = 0
DATA_COMPARE_ACCURACY_X_LT_0_Y_GE_0, // atan(y/x) + PI, x < 0 and y >= 0
DATA_COMPARE_ACCURACY_X_LT_0_Y_LT_0, // atan(y/x) - PI, x < 0 and y < 0
DATA_COMPARE_ACCURACY_X_0_Y_GT_0, // pi / 2, x = 0 and y > 0
DATA_COMPARE_ACCURACY_X_0_Y_LT_0, // -pi / 2, x = 0 and y < 0
DATA_COMPARE_U8, // values generated from \range_start to \range_end, checked
// against epsilon; result converted bf16 -> uint8_t
TEST_MODE_MAX,
};
// Currently selected test mode.
// NOTE(review): presumably set by main() before each run, as in the atan
// test — confirm against the rest of this file.
static TEST_MODE mode;
static uint16_t test_pattern[] = {
0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52, 0x3A6C, 0x3A83, 0x3A90,
0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF, 0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17,
0x3B1D, 0x3B24, 0x3B2A, 0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65,
0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90, 0x3B93, 0x3B97, 0x3B9A,
0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE, 0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1,
0x3BC5, 0x3BC8, 0x3BCB, 0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9,
0x3BEC, 0x3BEF, 0x3BF2, 0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03, 0x3C05, 0x3C06, 0x3C08,
0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12, 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C,
0x3C1D, 0x3C1F, 0x3C21, 0x3C22, 0x3C24, 0x3C25, 0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F,
0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E, 0x3C40, 0x3C41, 0x3C43,
0x3C45, 0x3C46, 0x3C48, 0x3C4A, 0x3C4B, 0x3C4D, 0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57,
0x3C58, 0x3C5A, 0x3C5C, 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A,
0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, 0x3C7B, 0x3C7C, 0x3C7E,
0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89,
0x3C8A, 0x3C8A, 0x3C8B, 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93,
0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, 0x3C9B, 0x3C9C, 0x3C9C,
0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6,
0x3CA7, 0x3CA8, 0x3CA9, 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0,
0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, 0x3CB8, 0x3CB9, 0x3CBA,
0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4,
0x3CC5, 0x3CC5, 0x3CC6, 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE,
0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, 0x3CD6, 0x3CD7, 0x3CD7,
0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1,
0x3CE2, 0x3CE3, 0x3CE4, 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB,
0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, 0x3CF3, 0x3CF4, 0x3CF5,
0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF,
0x3D00, 0x3D00, 0x3D01, 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 0x3D03, 0x3D03, 0x3D04, 0x3D04,
0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, 0x3D08, 0x3D09, 0x3D09,
0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 0x3D0E,
0x3D0F, 0x3D0F, 0x3D0F, 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13,
0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 0x3D16, 0x3D17, 0x3D17, 0x3D18, 0x3D18,
0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D,
0x3D1D, 0x3D1E, 0x3D1E, 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22,
0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, 0x3D26, 0x3D26, 0x3D27,
0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C,
0x3D2C, 0x3D2C, 0x3D2D, 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31,
0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 0x3D33, 0x3D33, 0x3D34, 0x3D34, 0x3D35, 0x3D35, 0x3D35,
0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A,
0x3D3B, 0x3D3B, 0x3D3C, 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F,
0x3D40, 0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, 0x3D43, 0x3D44, 0x3D44,
0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49,
0x3D4A, 0x3D4A, 0x3D4A, 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E,
0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, 0x3D52, 0x3D53, 0x3D53,
0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58,
0x3D58, 0x3D59, 0x3D59, 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D,
0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, 0x3D61, 0x3D61, 0x3D62,
0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67,
0x3D67, 0x3D67, 0x3D68, 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C,
0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, 0x3D70, 0x3D70, 0x3D70,
0x3D71, 0x3D71, 0x3D72, 0x3D72, 0x3D72, 0x3D73, 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75,
0x3D76, 0x3D76, 0x3D77, 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A,
0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, 0x3D7E, 0x3D7F, 0x3D7F,
0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82,
0x3D82, 0x3D82, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85,
0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D87, 0x3D87, 0x3D87,
0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89,
0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8C, 0x3D8C,
0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E,
0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91,
0x3D91, 0x3D91, 0x3D91, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93,
0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, 0x3D95, 0x3D96, 0x3D96,
0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98,
0x3D98, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B,
0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9D, 0x3D9D, 0x3D9D,
0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0,
0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2,
0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4,
0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7,
0x3DA7, 0x3DA7, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9,
0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAC, 0x3DAC,
0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE,
0x3DAE, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1,
0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3,
0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6,
0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8,
0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBB,
0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD,
0x3DBD, 0x3DBD, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF,
0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC2, 0x3DC2, 0x3DC2,
0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4,
0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7,
0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, 0x3DC9, 0x3DC9, 0x3DC9,
0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC,
0x3DCC, 0x3DCC, 0x3DCC, 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5,
0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, 0x3DDF, 0x3DE0, 0x3DE1,
0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5,
};
static uint16_t golden_bf16[] = {
0x42b4, 0x42b4, 0x42b4, 0x42b4, 0x42b4, 0x42b4, 0x42b3, 0x42b3, 0x42b3, 0x42b3, 0x42b3, 0x42b3,
0x42b3, 0x42b3, 0x42b3, 0x42b3, 0x42b2, 0x42b2, 0x42b2, 0x42b2, 0x42b2, 0x42b2, 0x42b2, 0x42b2,
0x42b2, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42af,
0x42af, 0x42af, 0x42af, 0x42af, 0x42af, 0x42af, 0x42af, 0x42af, 0x42af, 0x42ae, 0x42ae, 0x42ae,
0x42ae, 0x42ae, 0x42ae, 0x42ae, 0x42ae, 0x42ae, 0x42ad, 0x42ad, 0x42ad, 0x42ad, 0x42ad, 0x42ad,
0x42ad, 0x42ad, 0x42ad, 0x42ac, 0x42ac, 0x42ac, 0x42ac, 0x42ac, 0x42ac, 0x42ac, 0x42ac, 0x42ac,
0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42aa, 0x42aa, 0x42aa,
0x42aa, 0x42aa, 0x42aa, 0x42aa, 0x42aa, 0x42a9, 0x42a9, 0x42a9, 0x42a9, 0x42a9, 0x42a9, 0x42a9,
0x42a9, 0x42a9, 0x42a7, 0x42a7, 0x42a7, 0x42a7, 0x42a7, 0x42a7, 0x42a7, 0x42a7, 0x42a6, 0x42a6,
0x42a6, 0x42a6, 0x42a6, 0x42a6, 0x42a6, 0x42a6, 0x42a5, 0x42a5, 0x42a5, 0x42a5, 0x42a5, 0x42a5,
0x42a5, 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a3, 0x42a3,
0x42a3, 0x42a3, 0x42a3, 0x42a3, 0x42a3, 0x42a2, 0x42a2, 0x42a2, 0x42a2, 0x42a2, 0x42a2, 0x42a2,
0x42a2, 0x42a2, 0x42a1, 0x42a1, 0x42a1, 0x42a1, 0x42a1, 0x42a1, 0x42a1, 0x42a0, 0x42a0, 0x42a0,
0x42a0, 0x42a0, 0x42a0, 0x42a0, 0x42a0, 0x429e, 0x429e, 0x429e, 0x429e, 0x429e, 0x429e, 0x429d,
0x429d, 0x429d, 0x429d, 0x429d, 0x429d, 0x429d, 0x429d, 0x429d, 0x429c, 0x429c, 0x429c, 0x429c,
0x429c, 0x429b, 0x429b, 0x429b, 0x429b, 0x429b, 0x429b, 0x429b, 0x429b, 0x429a, 0x429a, 0x429a,
0x429a, 0x429a, 0x429a, 0x4299, 0x4299, 0x4299, 0x4299, 0x4299, 0x4299, 0x4299, 0x4299, 0x4298,
0x4298, 0x4298, 0x4298, 0x4298, 0x4298, 0x4297, 0x4297, 0x4297, 0x4297, 0x4297, 0x4297, 0x4296,
0x4296, 0x4296, 0x4296, 0x4296, 0x4295, 0x4295, 0x4295, 0x4295, 0x4295, 0x4295, 0x4295, 0x4295,
0x4294, 0x4294, 0x4294, 0x4294, 0x4294, 0x4294, 0x4293, 0x4293, 0x4293, 0x4293, 0x4293, 0x4293,
0x4292, 0x4292, 0x4292, 0x4292, 0x4292, 0x4291, 0x4291, 0x4291, 0x4291, 0x4291, 0x4291, 0x4291,
0x4291, 0x428f, 0x428f, 0x428f, 0x428e, 0x428e, 0x428e, 0x428e, 0x428e, 0x428e, 0x428e, 0x428e,
0x428d, 0x428d, 0x428d, 0x428d, 0x428c, 0x428c, 0x428c, 0x428c, 0x428c, 0x428c, 0x428c, 0x428c,
0x428b, 0x428b, 0x428b, 0x428a, 0x428a, 0x428a, 0x428a, 0x428a, 0x428a, 0x428a, 0x4289, 0x4289,
0x4289, 0x4288, 0x4288, 0x4288, 0x4288, 0x4288, 0x4288, 0x4287, 0x4287, 0x4287, 0x4287, 0x4287,
0x4286, 0x4286, 0x4286, 0x4286, 0x4286, 0x4286, 0x4286, 0x4286, 0x4285, 0x4285, 0x4285, 0x4285,
0x4285, 0x4285, 0x4285, 0x4285, 0x4285, 0x4284, 0x4284, 0x4284, 0x4284, 0x4284, 0x4283, 0x4283,
0x4282, 0x4282, 0x4282, 0x4282, 0x4282, 0x4281, 0x4281, 0x4281, 0x4281, 0x4281, 0x4281, 0x4281,
0x4280, 0x4280, 0x4280, 0x427e, 0x427e, 0x427e, 0x427e, 0x427e, 0x427c, 0x427c, 0x427c, 0x427a,
0x427a, 0x427a, 0x427a, 0x427a, 0x427a, 0x4278, 0x4278, 0x4278, 0x4277, 0x4277, 0x4277, 0x4277,
0x4277, 0x4277, 0x4275, 0x4275, 0x4275, 0x4273, 0x4273, 0x4273, 0x4273, 0x4273, 0x4271, 0x4271,
0x4271, 0x4270, 0x4270, 0x4270, 0x4270, 0x4270, 0x4270, 0x4270, 0x426e, 0x426c, 0x426c, 0x426c,
0x426c, 0x426c, 0x426a, 0x426a, 0x426a, 0x426a, 0x4269, 0x4269, 0x4269, 0x4269, 0x4269, 0x4267,
0x4267, 0x4266, 0x4266, 0x4266, 0x4266, 0x4266, 0x4264, 0x4264, 0x4264, 0x4262, 0x4262, 0x4262,
0x4262, 0x4261, 0x4261, 0x4261, 0x425f, 0x425f, 0x425f, 0x425f, 0x425f, 0x425e, 0x425e, 0x425c,
0x425c, 0x425c, 0x425c, 0x425c, 0x425b, 0x425b, 0x425b, 0x4259, 0x4259, 0x4259, 0x4259, 0x4257,
0x4257, 0x4257, 0x4256, 0x4256, 0x4256, 0x4256, 0x4256, 0x4253, 0x4253, 0x4253, 0x4253, 0x4253,
0x4253, 0x4253, 0x4250, 0x4250, 0x4250, 0x4250, 0x4250, 0x424f, 0x424f, 0x424d, 0x424d, 0x424d,
0x424d, 0x424d, 0x424b, 0x424b, 0x424b, 0x424b, 0x424b, 0x4249, 0x4249, 0x4249, 0x4248, 0x4248,
0x4248, 0x4248, 0x4247, 0x4247, 0x4247, 0x4245, 0x4245, 0x4244, 0x4244, 0x4244, 0x4243, 0x4243,
0x4241, 0x4241, 0x4241, 0x4240, 0x4240, 0x4240, 0x4240, 0x4240, 0x423e, 0x423e, 0x423e, 0x423e,
0x423b, 0x423b, 0x423b, 0x423b, 0x423b, 0x423a, 0x423a, 0x423a, 0x4239, 0x4239, 0x4237, 0x4237,
0x4237, 0x4236, 0x4236, 0x4236, 0x4236, 0x4236, 0x4235, 0x4235, 0x4234, 0x4234, 0x4232, 0x4232,
0x4232, 0x4232, 0x4232, 0x4231, 0x4231, 0x4231, 0x422f, 0x422f, 0x422d, 0x422d, 0x422d, 0x422d,
0x422d, 0x422c, 0x422c, 0x422c, 0x422a, 0x422a, 0x422a, 0x422a, 0x4228, 0x4228, 0x4228, 0x4228,
0x4228, 0x4227, 0x4227, 0x4227, 0x4225, 0x4225, 0x4223, 0x4223, 0x4223, 0x4223, 0x4223, 0x4223,
0x4223, 0x4221, 0x4220, 0x4220, 0x4220, 0x4220, 0x421f, 0x421f, 0x421f, 0x421d, 0x421d, 0x421d,
0x421d, 0x421d, 0x421b, 0x421b, 0x421b, 0x421b, 0x421b, 0x4219, 0x4219, 0x4218, 0x4218, 0x4218,
0x4218, 0x4218, 0x4215, 0x4215, 0x4215, 0x4215, 0x4215, 0x4215, 0x4215, 0x4213, 0x4213, 0x4213,
0x4212, 0x4212, 0x4211, 0x4211, 0x4211, 0x420f, 0x420f, 0x420f, 0x420f, 0x420d, 0x420d, 0x420d,
0x420c, 0x420c, 0x420c, 0x420c, 0x420c, 0x420a, 0x420a, 0x4209, 0x4209, 0x4209, 0x4209, 0x4209,
0x4207, 0x4207, 0x4207, 0x4206, 0x4206, 0x4206, 0x4206, 0x4204, 0x4204, 0x4204, 0x4202, 0x4202,
0x4202, 0x4202, 0x4202, 0x4201, 0x4201, 0x41fe, 0x41fe, 0x41fe, 0x41fe, 0x41fe, 0x41fb, 0x41fb,
0x41fb, 0x41fb, 0x41f8, 0x41f8, 0x41f8, 0x41f8, 0x41f8, 0x41f4, 0x41f1, 0x41f1, 0x41f1, 0x41f1,
0x41f1, 0x41f1, 0x41f1, 0x41ed, 0x41ed, 0x41ed, 0x41ed, 0x41ed, 0x41ea, 0x41ea, 0x41ea, 0x41e6,
0x41e6, 0x41e6, 0x41e3, 0x41e3, 0x41e3, 0x41e3, 0x41e3, 0x41df, 0x41df, 0x41df, 0x41df, 0x41dc,
0x41dc, 0x41dc, 0x41dc, 0x41dc, 0x41d8, 0x41d8, 0x41d8, 0x41d8, 0x41d5, 0x41d5, 0x41d5, 0x41d5,
0x41d5, 0x41d1, 0x41d1, 0x41d1, 0x41cd, 0x41cd, 0x41cd, 0x41cd, 0x41cd, 0x41cd, 0x41cd, 0x41c9,
0x41c9, 0x41c9, 0x41c6, 0x41c6, 0x41c6, 0x41c6, 0x41c6, 0x41c6, 0x41c6, 0x41c2, 0x41c2, 0x41be,
0x41be, 0x41be, 0x41be, 0x41be, 0x41be, 0x41ba, 0x41ba, 0x41ba, 0x41ba, 0x41ba, 0x41b6, 0x41b6,
0x41b6, 0x41b6, 0x41b6, 0x41b6, 0x41b6, 0x41b2, 0x41b2, 0x41ae, 0x41ae, 0x41ae, 0x41ae, 0x41ae,
0x41ae, 0x41ae, 0x41ae, 0x41aa, 0x41aa, 0x41aa, 0x41aa, 0x41aa, 0x41a6, 0x41a6, 0x41a6, 0x41a6,
0x41a6, 0x41a2, 0x41a2, 0x41a2, 0x41a2, 0x419e, 0x419e, 0x419e, 0x419e, 0x419e, 0x419e, 0x419a,
0x419a, 0x419a, 0x419a, 0x419a, 0x4196, 0x4196, 0x4196, 0x4196, 0x4196, 0x4196, 0x4196, 0x4196,
0x4196, 0x4192, 0x418e, 0x418e, 0x418e, 0x418e, 0x418e, 0x418e, 0x418e, 0x418e, 0x418e, 0x418a,
0x418a, 0x418a, 0x418a, 0x418a, 0x4186, 0x4186, 0x4186, 0x4186, 0x4186, 0x4186, 0x4186, 0x4181,
0x4181, 0x4181, 0x4181, 0x4181, 0x417a, 0x417a, 0x417a, 0x417a, 0x417a, 0x417a, 0x417a, 0x417a,
0x4172, 0x4172, 0x4172, 0x4172, 0x4172, 0x4169, 0x4169, 0x4169, 0x4169, 0x4169, 0x4169, 0x4161,
0x4161, 0x4161, 0x4161, 0x4161, 0x4161, 0x4158, 0x4158, 0x4158, 0x4158, 0x4158, 0x4158, 0x4158,
0x4158, 0x4158, 0x414f, 0x414f, 0x414f, 0x414f, 0x414f, 0x4147, 0x4147, 0x4147, 0x4147, 0x4147,
0x4147, 0x4147, 0x4147, 0x413e, 0x413e, 0x413e, 0x413e, 0x413e, 0x4135, 0x4135, 0x4135, 0x4135,
0x4135, 0x4135, 0x4135, 0x4135, 0x4135, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x4123,
0x4123, 0x4123, 0x4123, 0x4123, 0x4123, 0x4123, 0x4123, 0x4123, 0x411a, 0x411a, 0x411a, 0x411a,
0x411a, 0x411a, 0x4111, 0x4111, 0x4111, 0x4111, 0x4111, 0x4111, 0x4111, 0x4111, 0x4108, 0x4108,
0x4108, 0x4108, 0x4108, 0x4108, 0x4108, 0x4108, 0x40ff, 0x40ff, 0x40ff, 0x40ff, 0x40ff, 0x40ff,
0x40ff, 0x40ff, 0x40ed, 0x40ed, 0x40ed, 0x40ed, 0x40ed, 0x40ed, 0x40ed, 0x40ed, 0x40db, 0x40db,
0x40db, 0x40db, 0x40db, 0x40db, 0x40db, 0x40db, 0x40c9, 0x40c9, 0x40c9, 0x40c9, 0x40c9, 0x40c9,
0x40c9, 0x40c9, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7,
0x40a5, 0x40a5, 0x40a5, 0x40a5, 0x40a5, 0x40a5, 0x40a5, 0x4092, 0x4092, 0x4092, 0x4092, 0x4092,
0x4092, 0x4092, 0x4092, 0x4092, 0x4080, 0x4080, 0x4080, 0x4080, 0x4080, 0x4080, 0x4080, 0x4080,
0x4080, 0x405c, 0x405c, 0x405c, 0x405c, 0x405c, 0x405c, 0x405c, 0x405c, 0x405c, 0x4037, 0x4037,
0x4037, 0x4037, 0x4037, 0x4037, 0x4037, 0x4037, 0x4037, 0x4013, 0x4013, 0x4013, 0x4013, 0x4013,
0x4013, 0x4013, 0x4013, 0x4013, 0x4013, 0x3fdc, 0x3fdc, 0x3fdc, 0x3fdc, 0x3fdc, 0x3fdc, 0x3fdc,
0x3fdc, 0x3fdc, 0x3fdc, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93,
0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x0, 0x0,
0x0, 0x0, 0x0, 0x0,
};
// Host-side reference value: atan2(y, x) converted from radians to degrees.
// Related identity: atan2(y, x) = 2 * atan(y / (pow(x*x+y*y, 0.5) + x))
static double _gen_atan2_degree(float y, float x) {
  const auto angle_rad = atan2(y, x);
  return angle_rad * 180 / M_PI;
}
// Produces the expected (reference) bf16 output for every element of the input tensor.
//   ofmap       - destination buffer for the reference results; must not be null
//   ifmap       - x operands as bf16 bit patterns
//   ifmap2      - y operands as bf16 bit patterns
//   ifmap_shape - tensor shape; its element count bounds the loop
// The result depends on the file-level `mode`:
//   PRE_DATA_COMPARE_FIX - the pre-recorded golden_bf16 entry for that index
//   DATA_COMPARE_U8      - host atan2 in degrees, then passed through convert_bf16_s8 and
//                          narrowed to uint8_t
//   anything else        - host atan2 in degrees, converted to bf16
static void tl_lut_ref(uint16_t *ofmap, uint16_t *ifmap, uint16_t *ifmap2,
                       cvk_tl_shape_t ifmap_shape) {
  assert(ofmap);
  uint32_t idx = 0;
  while (idx < tl_shape_size(&ifmap_shape)) {
    const float y_val = convert_bf16_fp32(ifmap2[idx]);
    const float x_val = convert_bf16_fp32(ifmap[idx]);
    const double degrees = _gen_atan2_degree(y_val, x_val);
    uint16_t expected = convert_fp32_bf16(degrees);
    switch (mode) {
      case PRE_DATA_COMPARE_FIX:
        // NOTE(review): assumes golden_bf16 has at least as many entries as the tensor.
        expected = golden_bf16[idx];
        break;
      case DATA_COMPARE_U8:
        expected = (uint8_t)convert_bf16_s8(expected);
        break;
      default:
        break;
    }
    ofmap[idx] = expected;
    ++idx;
  }
}
/**
 * Compares the device output against the host reference, element by element.
 *
 * \param ofmap_data  device results (bf16 bit patterns)
 * \param ref_data    expected results (bf16 bit patterns)
 * \param ifmap       x operands (bf16), only used for the failure report
 * \param ifmap2      y operands (bf16), only used for the failure report
 * \param ifmap_size  number of elements to compare
 * \param epsilon     tolerance used outside PRE_DATA_COMPARE_FIX mode
 *
 * Comparison rule, selected by the file-level `mode`:
 *  - PRE_DATA_COMPARE_FIX: the bf16 bit patterns must be identical.
 *  - otherwise, when the device value is zero: |ref| must be below epsilon (absolute check).
 *  - otherwise: |ref - out| / |max(ref, out)| must be below epsilon (relative check).
 *
 * On the first mismatch a report is written to stderr and the process exits with -1, so the
 * function only ever returns true.
 */
static bool verify(uint16_t *ofmap_data, uint16_t *ref_data, uint16_t *ifmap, uint16_t *ifmap2,
                   uint64_t ifmap_size, float epsilon) {
  for (uint64_t i = 0; i < ifmap_size; i++) {
    uint16_t ref = ref_data[i];
    uint16_t ofmap_data_bf16 = ofmap_data[i];
    float ref_f = convert_bf16_fp32(ref);
    float ofmap_data_f = convert_bf16_fp32(ofmap_data_bf16);
    bool is_close;

    if (mode == PRE_DATA_COMPARE_FIX) {
      is_close = ofmap_data_bf16 == ref;
    } else if (fabs(ofmap_data_f) * epsilon == 0) {
      // A relative error is meaningless against zero, fall back to an absolute check.
      // https://stackoverflow.com/questions/19837576/comparing-floating-point-number-to-zero
      // fabs() is used on purpose: an unqualified abs() may bind to int abs(int) and
      // silently truncate the float argument.
      is_close = fabs(ref_f) < epsilon;
    } else {
      // NOTE(review): std::max() picks the algebraically larger value, so for two negative
      // numbers the denominator is the one with the smaller magnitude - confirm this is the
      // intended normalisation.
      is_close = fabs(ref_f - ofmap_data_f) / fabs(std::max(ref_f, ofmap_data_f)) < epsilon;
    }

    if (!is_close) {
      float y = convert_bf16_fp32(ifmap2[i]);
      float x = convert_bf16_fp32(ifmap[i]);
      // uint64_t is not `unsigned long` on every target, so print it as unsigned long long.
      fprintf(stderr,
              "comparing failed at ofmap_data[%llu]\n"
              "\tgot %x, exp %x, fp32: got %f exp %f, atan2(%f, %f) = %f"
              "\ty %f(0x%x), x %f(0x%x)\n",
              (unsigned long long)i, ofmap_data_bf16, ref, ofmap_data_f, ref_f, y, x,
              _gen_atan2_degree(y, x), y, ifmap2[i], x, ifmap[i]);
      exit(-1);
    }
  }
  return true;
}
/**
 * Fills input_data with a deterministic bf16 ramp.
 *
 * Element i is ((int)i % (range_end - 2)) + 0.03 + (i % 256) * 0.002, converted to bf16.
 * Despite the historical "-8 ~ +8" remark, no sign alternation is applied here.
 *
 * \param input_data  destination buffer, ifmap_size elements
 * \param ifmap_size  number of elements to generate
 * \param range_start kept for interface compatibility; it does not influence the output
 * \param range_end   controls the period of the integer part; must not be 2
 *
 * The random-number draws that used to follow the store never reached input_data and have
 * been dropped together with the overwritten first formula.
 */
static void _gen_input(uint16_t *input_data, uint64_t ifmap_size, int range_start, int range_end) {
  (void)range_start;
  const int period = range_end - 2;
  // A zero period would make the modulo below undefined behaviour.
  assert(period != 0);
  const int table_hw = 256;
  for (uint64_t i = 0; i < ifmap_size; i++) {
    float input = ((int)i % period) + 0.03 + (i % table_hw) * 0.002;
    input_data[i] = convert_fp32_bf16(input);
  }
}
static void gen_input(uint16_t *x, uint16_t *y, uint64_t ifmap_size, TEST_MODE mode,
int range_start, int range_end) {
if (mode == PRE_DATA_COMPARE_FIX) {
memcpy(x, &test_pattern, sizeof(test_pattern));
} else {
range_start = abs(range_start);
range_end = abs(range_end);
_gen_input(x, ifmap_size, range_start, range_end);
}
// invert for test
for (uint64_t i = 0; i < ifmap_size; i++) {
y[i] = x[(ifmap_size - 1) - i];
}
if (mode == DATA_COMPARE_ACCURACY_X_GT_0) {
// y = any
uint32_t i = 0;
for (; i < ifmap_size / 4; i++) {
// y < 0
y[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(y[i]));
y[i + ifmap_size / 4] = convert_fp32_bf16(0);
}
} else if (mode == DATA_COMPARE_ACCURACY_X_LT_0_Y_GE_0) {
// x < 0 and y >= 0
for (uint32_t i = 0; i < ifmap_size; i++) {
x[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(x[i]));
}
for (uint32_t i = 0; i < ifmap_size / 4; i++) {
y[i + ifmap_size / 4] = convert_fp32_bf16(0);
}
} else if (mode == DATA_COMPARE_ACCURACY_X_LT_0_Y_LT_0) {
// x < 0 and y < 0
for (uint32_t i = 0; i < ifmap_size; i++) {
x[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(x[i]));
y[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(y[i]));
}
} else if (mode == DATA_COMPARE_ACCURACY_X_0_Y_GT_0) {
// pi / 2, x = 0 and y > 0
for (uint32_t i = 0; i < ifmap_size; i++) {
x[i] = convert_fp32_bf16(0);
}
} else if (mode == DATA_COMPARE_ACCURACY_X_0_Y_LT_0) {
// -pi / 2, x = 0 and y < 0
for (uint32_t i = 0; i < ifmap_size; i++) {
x[i] = convert_fp32_bf16(0);
y[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(y[i]));
}
}
if (mode != PRE_DATA_COMPARE_FIX) {
int i = 0;
x[i] = convert_fp32_bf16(-10.0);
y[i++] = convert_fp32_bf16(6.0);
y[i] = convert_fp32_bf16(1.000000);
x[i++] = convert_fp32_bf16(19.000000);
y[i] = convert_fp32_bf16(5.000000);
x[i++] = convert_fp32_bf16(-125.000000);
y[i] = convert_fp32_bf16(1.070312);
x[i++] = convert_fp32_bf16(0.498046);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-8.000000);
x[i] = convert_fp32_bf16(424.000);
y[i++] = convert_fp32_bf16(-1.00);
x[i] = convert_fp32_bf16(2.484375);
y[i++] = convert_fp32_bf16(-7.531250);
x[i] = convert_fp32_bf16(-2.484375);
y[i++] = convert_fp32_bf16(-7.531250);
x[i] = convert_fp32_bf16(0);
y[i++] = convert_fp32_bf16(7.531250);
x[i] = convert_fp32_bf16(0);
y[i++] = convert_fp32_bf16(-7.531250);
x[i] = convert_fp32_bf16(0);
y[i++] = convert_fp32_bf16(0);
x[i] = convert_fp32_bf16(0);
y[i++] = convert_fp32_bf16(0.394531);
y[i] = convert_fp32_bf16(-4.000000);
x[i++] = convert_fp32_bf16(-64.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-4.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-40.000000);
y[i] = convert_fp32_bf16(1.000000);
x[i++] = convert_fp32_bf16(-53.000000);
y[i] = convert_fp32_bf16(-9.000000);
x[i++] = convert_fp32_bf16(-91.000000);
y[i] = convert_fp32_bf16(12.000000);
x[i++] = convert_fp32_bf16(-164.000000);
y[i] = convert_fp32_bf16(-20.000000);
x[i++] = convert_fp32_bf16(-320.000000);
y[i] = convert_fp32_bf16(1.000000);
x[i++] = convert_fp32_bf16(-71.000000);
y[i] = convert_fp32_bf16(1.000000);
x[i++] = convert_fp32_bf16(-155.000000);
y[i] = convert_fp32_bf16(1.000000);
x[i++] = convert_fp32_bf16(-247.000000);
y[i] = convert_fp32_bf16(-2.000000);
x[i++] = convert_fp32_bf16(-118.000000);
y[i] = convert_fp32_bf16(-2.000000);
x[i++] = convert_fp32_bf16(-54.000000);
y[i] = convert_fp32_bf16(-5.000000);
x[i++] = convert_fp32_bf16(-392.000000);
y[i] = convert_fp32_bf16(-37.000000);
x[i++] = convert_fp32_bf16(-520.000000);
y[i] = convert_fp32_bf16(-1.000000);
x[i++] = convert_fp32_bf16(-19.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-10.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-8.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-2.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-14.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-2.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-6.000000);
y[i] = convert_fp32_bf16(-1.000000);
x[i++] = convert_fp32_bf16(-21.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-14.000000);
y[i] = convert_fp32_bf16(-1.000000);
x[i++] = convert_fp32_bf16(-17.000000);
y[i] = convert_fp32_bf16(-1.000000);
x[i++] = convert_fp32_bf16(-17.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-8.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-4.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-10.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-8.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-14.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-4.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-2.000000);
y[i] = convert_fp32_bf16(1.000000);
x[i++] = convert_fp32_bf16(-41.000000);
y[i] = convert_fp32_bf16(1.000000);
x[i++] = convert_fp32_bf16(-69.000000);
y[i] = convert_fp32_bf16(4.000000);
x[i++] = convert_fp32_bf16(-86.000000);
y[i] = convert_fp32_bf16(1.000000);
x[i++] = convert_fp32_bf16(-41.000000);
y[i] = convert_fp32_bf16(-2.000000);
x[i++] = convert_fp32_bf16(-34.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-6.000000);
y[i] = convert_fp32_bf16(1.000000);
x[i++] = convert_fp32_bf16(-41.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-136.000000);
y[i] = convert_fp32_bf16(-3.000000);
x[i++] = convert_fp32_bf16(-79.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-38.000000);
y[i] = convert_fp32_bf16(5.000000);
x[i++] = convert_fp32_bf16(-173.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-78.000000);
y[i] = convert_fp32_bf16(-2.000000);
x[i++] = convert_fp32_bf16(-60.000000);
y[i] = convert_fp32_bf16(3.000000);
x[i++] = convert_fp32_bf16(-123.000000);
y[i] = convert_fp32_bf16(-9.000000);
x[i++] = convert_fp32_bf16(-280.000000);
y[i] = convert_fp32_bf16(3.000000);
x[i++] = convert_fp32_bf16(-39.000000);
y[i] = convert_fp32_bf16(2.000000);
x[i++] = convert_fp32_bf16(-524.000000);
y[i] = convert_fp32_bf16(11.000000);
x[i++] = convert_fp32_bf16(-376.000000);
y[i] = convert_fp32_bf16(5.000000);
x[i++] = convert_fp32_bf16(-131.000000);
y[i] = convert_fp32_bf16(11.000000);
x[i++] = convert_fp32_bf16(-324.000000);
y[i] = convert_fp32_bf16(9.000000);
x[i++] = convert_fp32_bf16(-125.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-92.000000);
y[i] = convert_fp32_bf16(-7.000000);
x[i++] = convert_fp32_bf16(-233.000000);
y[i] = convert_fp32_bf16(10.000000);
x[i++] = convert_fp32_bf16(-170.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-4.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-4.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-10.000000);
y[i] = convert_fp32_bf16(-1.000000);
x[i++] = convert_fp32_bf16(-23.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-6.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-6.000000);
y[i] = convert_fp32_bf16(-3.000000);
x[i++] = convert_fp32_bf16(-37.000000);
y[i] = convert_fp32_bf16(-9);
x[i++] = convert_fp32_bf16(-1);
y[i] = convert_fp32_bf16(7.0);
x[i++] = convert_fp32_bf16(-1);
y[i] = convert_fp32_bf16(0);
x[i++] = convert_fp32_bf16(-1);
}
#ifdef DBG
for (uint64_t i = 0; i < ifmap_size; i++) {
printf("source[%ld] y %f x %f\n", i, convert_bf16_fp32(y[i]), convert_bf16_fp32(x[i]));
}
#endif /* ifdef DBG */
}
// Runs one end-to-end atan2 (degree) pass for the current file-level `mode`:
// generate the x / y inputs and the host reference, build the lookup tables, upload
// everything to local memory, emit cvm_atan2_fast_degree_emit(), read the result back and
// check it with verify(). All tensors and host buffers created here are released before
// returning.
static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) {
// TODO: check more shape / align
cvk_chip_info_t chip_info = bmk->info;
// Input shape is 1 x npu_num x 16 x 16 by default.
uint32_t input_n = 1;
uint32_t input_c = chip_info.npu_num;
uint32_t input_h = 16;
uint32_t input_w = 16;
// Tolerance handed to verify(); range_start / range_end are handed to gen_input().
float epsilon = 0.2;
int range_start = -8;
int range_end = 8;
// The fixed-pattern mode uses a smaller 4 x 8 plane per channel.
if (mode == PRE_DATA_COMPARE_FIX) {
input_h = 4;
input_w = 8;
}
cvk_fmt_t fmt = CVK_FMT_BF16;
cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w};
cvk_tl_shape_t ofmap_shape = ifmap_shape;
// get lut table shape and size
cvk_tl_shape_t table_shape;
uint64_t table_bytesize = cvm_lut_tbl_bytesize(bmk, &table_shape, fmt);
// get input / output size
uint64_t ifmap_size = tl_shape_size(&ifmap_shape);
uint64_t ofmap_size = tl_shape_size(&ofmap_shape);
int data_type_size = bytesize_of_fmt(fmt);
uint64_t ifmap_bytesize = ifmap_size * data_type_size;
uint64_t ofmap_bytesize = ofmap_size * data_type_size;
// atan2 takes two inputs
// NOTE: the tensors below are released at the end of this function in the exact reverse
// of this allocation order - presumably a requirement of the local-memory allocator;
// keep both sequences in sync when adding or removing a tensor.
cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_ifmap2 = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1);
// `out` is an alias of the output tensor; it is the tensor read back after the emit.
cvk_tl_t *out = tl_ofmap_bf16;
// atan buf
cvk_tl_t *tl_y0_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_invert_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_pos_neg_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
// reciprocal buf
cvk_tl_t *tl_reciprocal_table_answer = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_reciprocal_table_answer_mantissa = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
// temp buf
cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_buf2 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1);
cvk_tl_t *tl_buf3 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1);
// Host-side buffers: inputs, reference output and the lookup tables.
uint16_t *input_data = (uint16_t *)xmalloc(ifmap_bytesize);
uint16_t *input_data2 = (uint16_t *)xmalloc(ifmap_bytesize);
uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize);
// for reciprocal
uint16_t *table_reciprocal_data = (uint16_t *)xmalloc(table_bytesize);
uint16_t *table_reciprocal_data_mantissa = (uint16_t *)xmalloc(table_bytesize);
// for atan
uint16_t *table_data_atan_y0 = (uint16_t *)xmalloc(table_bytesize);
uint16_t *table_data_atan_invert = (uint16_t *)xmalloc(table_bytesize);
uint16_t *table_data_atan_pos_neg = (uint16_t *)xmalloc(table_bytesize);
// for search '0' index
// NOTE(review): this table is generated below but never uploaded to a tensor in this
// function - confirm whether it is still needed.
uint16_t *idx_0_table_data = (uint16_t *)xmalloc(table_bytesize);
// init input / ref
// input_data is x, input_data2 is y
gen_input(input_data, input_data2, ifmap_size, mode, range_start, range_end);
tl_lut_ref(ref_data, input_data, input_data2, ifmap_shape);
// init lut table
cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape);
cvm_atan_fast_degree_tbl(table_data_atan_y0, table_data_atan_invert, table_data_atan_pos_neg,
&table_shape);
cvm_gen_0_tbl(idx_0_table_data, &table_shape);
// sys->local
test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)input_data);
test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap2, (uint8_t *)input_data2);
test_put_tensor_g2l_comp(ctx, bmk, tl_reciprocal_table_answer, (uint8_t *)table_reciprocal_data);
test_put_tensor_g2l_comp(ctx, bmk, tl_reciprocal_table_answer_mantissa,
(uint8_t *)table_reciprocal_data_mantissa);
test_put_tensor_g2l_comp(ctx, bmk, tl_y0_buf, (uint8_t *)table_data_atan_y0);
test_put_tensor_g2l_comp(ctx, bmk, tl_invert_buf, (uint8_t *)table_data_atan_invert);
test_put_tensor_g2l_comp(ctx, bmk, tl_pos_neg_buf, (uint8_t *)table_data_atan_pos_neg);
// Operand order: y (tl_ifmap2) first, then x (tl_ifmap).
cvm_atan2_fast_degree_emit(bmk, tl_ifmap2, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_y0_buf,
tl_invert_buf, tl_pos_neg_buf, tl_reciprocal_table_answer,
tl_reciprocal_table_answer_mantissa, OUT tl_ofmap_bf16, fmt);
// local->sys: the returned host buffer is released with free() below.
uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, out);
// verify() terminates the process on the first mismatch.
verify(ofmap_data, ref_data, input_data, input_data2, ifmap_size, epsilon);
// Release local tensors, last allocated first.
free_tl(bmk, tl_buf3);
free_tl(bmk, tl_buf2);
free_tl(bmk, tl_buf);
free_tl(bmk, tl_reciprocal_table_answer_mantissa);
free_tl(bmk, tl_reciprocal_table_answer);
free_tl(bmk, tl_pos_neg_buf);
free_tl(bmk, tl_invert_buf);
free_tl(bmk, tl_y0_buf);
free_tl(bmk, tl_ofmap_bf16);
free_tl(bmk, tl_ifmap2);
free_tl(bmk, tl_ifmap);
// Release host buffers.
free(table_data_atan_y0);
free(idx_0_table_data);
free(table_data_atan_invert);
free(table_data_atan_pos_neg);
free(table_reciprocal_data);
free(table_reciprocal_data_mantissa);
free(input_data);
free(ref_data);
free(ofmap_data);
free(input_data2);
}
// Test entry point: initialises the runtime, runs testbench() once per test mode and
// restores the floating-point rounding mode before exiting.
int main() {
  CVI_RT_HANDLE ctx;
  cvk_context_t *bmk;
  const int saved_round_mode = set_store_feround();
  test_init(&ctx, &bmk);

  // Every mode from PRE_DATA_COMPARE_FIX up to, but not including, DATA_COMPARE_U8 is run.
  int current = PRE_DATA_COMPARE_FIX;
  while (current < DATA_COMPARE_U8) {
    mode = static_cast<TEST_MODE>(current);
    printf("test mode %d...\n", mode);
    testbench(&ctx, bmk);
    ++current;
  }

  printf("pass\n");
  test_exit(&ctx, bmk);
  restore_feround(saved_round_mode);
  return 0;
}

View File

@ -0,0 +1,719 @@
/**
* \brief atan2 is implemented on top of atan; refer to the
* [wiki](https://en.wikipedia.org/wiki/Atan2) for more details
*/
#include <cvimath_internal.h>
#include <test_cvikernel_util.h>
#define OUT
#define IN
#include <cfloat>
#include <iomanip>
#include <iostream>
#include <map>
#include <random>
#include <string>
//#define DBG
/**
* pre_data means we test a fixed pattern; the result should be the same as the LUT
*/
// Selects which input data set and which comparison rule a test run uses.
// Values are consecutive, so the modes can be iterated with an integer loop.
enum TEST_MODE {
  // pre-data (fixed pattern) + exact compare
  PRE_DATA_COMPARE_FIX = 0,
  // generated \range_start to \range_end values, checked against epsilon;
  // by default x > 0 and y > 0
  DATA_COMPARE_ACCURACY = 1,
  // atan(y/x), x > 0, y = 0
  DATA_COMPARE_ACCURACY_X_GT_0 = 2,
  // atan(y/x) + PI, x < 0 and y >= 0
  DATA_COMPARE_ACCURACY_X_LT_0_Y_GE_0 = 3,
  // atan(y/x) - PI, x < 0 and y < 0
  DATA_COMPARE_ACCURACY_X_LT_0_Y_LT_0 = 4,
  // pi / 2, x = 0 and y > 0
  DATA_COMPARE_ACCURACY_X_0_Y_GT_0 = 5,
  // -pi / 2, x = 0 and y < 0
  DATA_COMPARE_ACCURACY_X_0_Y_LT_0 = 6,
  // generated \range_start to \range_end values, checked against epsilon,
  // result converted bf16->uint8_t
  DATA_COMPARE_U8 = 7,
  // number of modes; not a valid mode itself
  TEST_MODE_MAX = 8,
};
// File-level test mode. As a static it is zero-initialised, i.e. it starts out as
// PRE_DATA_COMPARE_FIX. NOTE(review): presumably assigned by main() before each run - the
// code that reads and writes it lies outside this excerpt.
static TEST_MODE mode;
static uint16_t test_pattern[] = {
0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52, 0x3A6C, 0x3A83, 0x3A90,
0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF, 0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17,
0x3B1D, 0x3B24, 0x3B2A, 0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65,
0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90, 0x3B93, 0x3B97, 0x3B9A,
0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE, 0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1,
0x3BC5, 0x3BC8, 0x3BCB, 0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9,
0x3BEC, 0x3BEF, 0x3BF2, 0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03, 0x3C05, 0x3C06, 0x3C08,
0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12, 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C,
0x3C1D, 0x3C1F, 0x3C21, 0x3C22, 0x3C24, 0x3C25, 0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F,
0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E, 0x3C40, 0x3C41, 0x3C43,
0x3C45, 0x3C46, 0x3C48, 0x3C4A, 0x3C4B, 0x3C4D, 0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57,
0x3C58, 0x3C5A, 0x3C5C, 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A,
0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, 0x3C7B, 0x3C7C, 0x3C7E,
0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89,
0x3C8A, 0x3C8A, 0x3C8B, 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93,
0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, 0x3C9B, 0x3C9C, 0x3C9C,
0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6,
0x3CA7, 0x3CA8, 0x3CA9, 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0,
0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, 0x3CB8, 0x3CB9, 0x3CBA,
0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4,
0x3CC5, 0x3CC5, 0x3CC6, 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE,
0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, 0x3CD6, 0x3CD7, 0x3CD7,
0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1,
0x3CE2, 0x3CE3, 0x3CE4, 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB,
0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, 0x3CF3, 0x3CF4, 0x3CF5,
0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF,
0x3D00, 0x3D00, 0x3D01, 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 0x3D03, 0x3D03, 0x3D04, 0x3D04,
0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, 0x3D08, 0x3D09, 0x3D09,
0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 0x3D0E,
0x3D0F, 0x3D0F, 0x3D0F, 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13,
0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 0x3D16, 0x3D17, 0x3D17, 0x3D18, 0x3D18,
0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D,
0x3D1D, 0x3D1E, 0x3D1E, 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22,
0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, 0x3D26, 0x3D26, 0x3D27,
0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C,
0x3D2C, 0x3D2C, 0x3D2D, 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31,
0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 0x3D33, 0x3D33, 0x3D34, 0x3D34, 0x3D35, 0x3D35, 0x3D35,
0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A,
0x3D3B, 0x3D3B, 0x3D3C, 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F,
0x3D40, 0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, 0x3D43, 0x3D44, 0x3D44,
0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49,
0x3D4A, 0x3D4A, 0x3D4A, 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E,
0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, 0x3D52, 0x3D53, 0x3D53,
0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58,
0x3D58, 0x3D59, 0x3D59, 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D,
0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, 0x3D61, 0x3D61, 0x3D62,
0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67,
0x3D67, 0x3D67, 0x3D68, 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C,
0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, 0x3D70, 0x3D70, 0x3D70,
0x3D71, 0x3D71, 0x3D72, 0x3D72, 0x3D72, 0x3D73, 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75,
0x3D76, 0x3D76, 0x3D77, 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A,
0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, 0x3D7E, 0x3D7F, 0x3D7F,
0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82,
0x3D82, 0x3D82, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85,
0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D87, 0x3D87, 0x3D87,
0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89,
0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8C, 0x3D8C,
0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E,
0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91,
0x3D91, 0x3D91, 0x3D91, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93,
0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, 0x3D95, 0x3D96, 0x3D96,
0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98,
0x3D98, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B,
0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9D, 0x3D9D, 0x3D9D,
0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0,
0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2,
0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4,
0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7,
0x3DA7, 0x3DA7, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9,
0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAC, 0x3DAC,
0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE,
0x3DAE, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1,
0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3,
0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6,
0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8,
0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBB,
0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD,
0x3DBD, 0x3DBD, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF,
0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC2, 0x3DC2, 0x3DC2,
0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4,
0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7,
0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, 0x3DC9, 0x3DC9, 0x3DC9,
0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC,
0x3DCC, 0x3DCC, 0x3DCC, 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5,
0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, 0x3DDF, 0x3DE0, 0x3DE1,
0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5,
};
// Expected bf16 outputs for the fixed-pattern run. When mode == PRE_DATA_COMPARE_FIX,
// tl_lut_ref() copies these values into the reference buffer and verify() requires a
// bit-exact match against them; the table length also sets the element count in that mode.
static uint16_t golden_bf16[] = {
0x3fc9, 0x3fc9, 0x3fc9, 0x3fc9, 0x3fc9, 0x3fc9, 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc8,
0x3fc8, 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7,
0x3fc7, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc4,
0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc3, 0x3fc3, 0x3fc3,
0x3fc3, 0x3fc3, 0x3fc3, 0x3fc3, 0x3fc3, 0x3fc3, 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc1,
0x3fc1, 0x3fc1, 0x3fc1, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0,
0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbe, 0x3fbe, 0x3fbe,
0x3fbe, 0x3fbe, 0x3fbe, 0x3fbe, 0x3fbe, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc,
0x3fbc, 0x3fbc, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fba, 0x3fba,
0x3fba, 0x3fba, 0x3fba, 0x3fba, 0x3fba, 0x3fba, 0x3fb9, 0x3fb9, 0x3fb9, 0x3fb9, 0x3fb9, 0x3fb9,
0x3fb9, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb6, 0x3fb6,
0x3fb6, 0x3fb6, 0x3fb6, 0x3fb6, 0x3fb6, 0x3fb5, 0x3fb5, 0x3fb5, 0x3fb5, 0x3fb5, 0x3fb5, 0x3fb5,
0x3fb5, 0x3fb5, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb2, 0x3fb2, 0x3fb2,
0x3fb2, 0x3fb2, 0x3fb2, 0x3fb2, 0x3fb2, 0x3fb1, 0x3fb1, 0x3fb1, 0x3fb1, 0x3fb1, 0x3fb1, 0x3fb0,
0x3fb0, 0x3fb0, 0x3fb0, 0x3fb0, 0x3fb0, 0x3fb0, 0x3fb0, 0x3fb0, 0x3faf, 0x3faf, 0x3faf, 0x3faf,
0x3faf, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fac, 0x3fac, 0x3fac,
0x3fac, 0x3fac, 0x3fac, 0x3fab, 0x3fab, 0x3fab, 0x3fab, 0x3fab, 0x3fab, 0x3fab, 0x3fab, 0x3faa,
0x3faa, 0x3faa, 0x3faa, 0x3faa, 0x3faa, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa7,
0x3fa7, 0x3fa7, 0x3fa7, 0x3fa7, 0x3fa6, 0x3fa6, 0x3fa6, 0x3fa6, 0x3fa6, 0x3fa6, 0x3fa6, 0x3fa6,
0x3fa5, 0x3fa5, 0x3fa5, 0x3fa5, 0x3fa5, 0x3fa5, 0x3fa4, 0x3fa4, 0x3fa4, 0x3fa4, 0x3fa4, 0x3fa4,
0x3fa3, 0x3fa3, 0x3fa3, 0x3fa3, 0x3fa3, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa1,
0x3fa1, 0x3fa0, 0x3fa0, 0x3fa0, 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9f,
0x3f9e, 0x3f9e, 0x3f9e, 0x3f9e, 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9d,
0x3f9c, 0x3f9c, 0x3f9c, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f99, 0x3f99,
0x3f99, 0x3f98, 0x3f98, 0x3f98, 0x3f98, 0x3f98, 0x3f98, 0x3f97, 0x3f97, 0x3f97, 0x3f97, 0x3f97,
0x3f96, 0x3f96, 0x3f96, 0x3f96, 0x3f96, 0x3f96, 0x3f96, 0x3f96, 0x3f95, 0x3f95, 0x3f94, 0x3f94,
0x3f94, 0x3f94, 0x3f94, 0x3f94, 0x3f94, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f92, 0x3f92,
0x3f91, 0x3f91, 0x3f91, 0x3f91, 0x3f91, 0x3f90, 0x3f90, 0x3f90, 0x3f90, 0x3f90, 0x3f90, 0x3f90,
0x3f8f, 0x3f8f, 0x3f8f, 0x3f8e, 0x3f8e, 0x3f8e, 0x3f8e, 0x3f8e, 0x3f8d, 0x3f8d, 0x3f8d, 0x3f8c,
0x3f8c, 0x3f8c, 0x3f8c, 0x3f8c, 0x3f8c, 0x3f8b, 0x3f8b, 0x3f8b, 0x3f8a, 0x3f8a, 0x3f8a, 0x3f8a,
0x3f8a, 0x3f8a, 0x3f89, 0x3f89, 0x3f89, 0x3f88, 0x3f88, 0x3f88, 0x3f88, 0x3f88, 0x3f87, 0x3f87,
0x3f87, 0x3f86, 0x3f86, 0x3f86, 0x3f86, 0x3f86, 0x3f86, 0x3f86, 0x3f85, 0x3f84, 0x3f84, 0x3f84,
0x3f84, 0x3f84, 0x3f83, 0x3f83, 0x3f83, 0x3f83, 0x3f82, 0x3f82, 0x3f82, 0x3f82, 0x3f82, 0x3f81,
0x3f81, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7d, 0x3f7d, 0x3f7d,
0x3f7d, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f78, 0x3f78, 0x3f76,
0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f74, 0x3f74, 0x3f74, 0x3f72, 0x3f72, 0x3f72, 0x3f72, 0x3f71,
0x3f71, 0x3f71, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6c, 0x3f6c, 0x3f6c, 0x3f6c, 0x3f6c,
0x3f6c, 0x3f6c, 0x3f69, 0x3f69, 0x3f69, 0x3f69, 0x3f69, 0x3f67, 0x3f67, 0x3f65, 0x3f65, 0x3f65,
0x3f65, 0x3f65, 0x3f62, 0x3f62, 0x3f62, 0x3f62, 0x3f62, 0x3f61, 0x3f61, 0x3f61, 0x3f5f, 0x3f5f,
0x3f5f, 0x3f5f, 0x3f5e, 0x3f5e, 0x3f5e, 0x3f5c, 0x3f5c, 0x3f5b, 0x3f5b, 0x3f5b, 0x3f59, 0x3f59,
0x3f58, 0x3f58, 0x3f58, 0x3f57, 0x3f57, 0x3f57, 0x3f57, 0x3f57, 0x3f54, 0x3f54, 0x3f54, 0x3f54,
0x3f51, 0x3f51, 0x3f51, 0x3f51, 0x3f51, 0x3f50, 0x3f50, 0x3f50, 0x3f4e, 0x3f4e, 0x3f4d, 0x3f4d,
0x3f4d, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4a, 0x3f4a, 0x3f49, 0x3f49, 0x3f46, 0x3f46,
0x3f46, 0x3f46, 0x3f46, 0x3f45, 0x3f45, 0x3f45, 0x3f44, 0x3f44, 0x3f41, 0x3f41, 0x3f41, 0x3f41,
0x3f41, 0x3f40, 0x3f40, 0x3f40, 0x3f3e, 0x3f3e, 0x3f3e, 0x3f3e, 0x3f3c, 0x3f3c, 0x3f3c, 0x3f3c,
0x3f3c, 0x3f3a, 0x3f3a, 0x3f3a, 0x3f39, 0x3f39, 0x3f36, 0x3f36, 0x3f36, 0x3f36, 0x3f36, 0x3f36,
0x3f36, 0x3f34, 0x3f33, 0x3f33, 0x3f33, 0x3f33, 0x3f31, 0x3f31, 0x3f31, 0x3f30, 0x3f30, 0x3f30,
0x3f30, 0x3f30, 0x3f2d, 0x3f2d, 0x3f2d, 0x3f2d, 0x3f2d, 0x3f2b, 0x3f2b, 0x3f2a, 0x3f2a, 0x3f2a,
0x3f2a, 0x3f2a, 0x3f26, 0x3f26, 0x3f26, 0x3f26, 0x3f26, 0x3f26, 0x3f26, 0x3f25, 0x3f25, 0x3f25,
0x3f23, 0x3f23, 0x3f21, 0x3f21, 0x3f21, 0x3f20, 0x3f20, 0x3f20, 0x3f20, 0x3f1e, 0x3f1e, 0x3f1e,
0x3f1c, 0x3f1c, 0x3f1c, 0x3f1c, 0x3f1c, 0x3f1b, 0x3f1b, 0x3f19, 0x3f19, 0x3f19, 0x3f19, 0x3f19,
0x3f17, 0x3f17, 0x3f17, 0x3f15, 0x3f15, 0x3f15, 0x3f15, 0x3f14, 0x3f14, 0x3f14, 0x3f12, 0x3f12,
0x3f12, 0x3f12, 0x3f12, 0x3f10, 0x3f10, 0x3f0e, 0x3f0e, 0x3f0e, 0x3f0e, 0x3f0e, 0x3f0c, 0x3f0c,
0x3f0c, 0x3f0c, 0x3f0a, 0x3f0a, 0x3f0a, 0x3f0a, 0x3f0a, 0x3f08, 0x3f07, 0x3f07, 0x3f07, 0x3f07,
0x3f07, 0x3f07, 0x3f07, 0x3f05, 0x3f05, 0x3f05, 0x3f05, 0x3f05, 0x3f03, 0x3f03, 0x3f03, 0x3f01,
0x3f01, 0x3f01, 0x3efe, 0x3efe, 0x3efe, 0x3efe, 0x3efe, 0x3efa, 0x3efa, 0x3efa, 0x3efa, 0x3ef6,
0x3ef6, 0x3ef6, 0x3ef6, 0x3ef6, 0x3ef1, 0x3ef1, 0x3ef1, 0x3ef1, 0x3eed, 0x3eed, 0x3eed, 0x3eed,
0x3eed, 0x3ee9, 0x3ee9, 0x3ee9, 0x3ee5, 0x3ee5, 0x3ee5, 0x3ee5, 0x3ee5, 0x3ee5, 0x3ee5, 0x3ee1,
0x3ee1, 0x3ee1, 0x3edd, 0x3edd, 0x3edd, 0x3edd, 0x3edd, 0x3edd, 0x3edd, 0x3ed9, 0x3ed9, 0x3ed4,
0x3ed4, 0x3ed4, 0x3ed4, 0x3ed4, 0x3ed4, 0x3ed0, 0x3ed0, 0x3ed0, 0x3ed0, 0x3ed0, 0x3ecc, 0x3ecc,
0x3ecc, 0x3ecc, 0x3ecc, 0x3ecc, 0x3ecc, 0x3ec7, 0x3ec7, 0x3ec3, 0x3ec3, 0x3ec3, 0x3ec3, 0x3ec3,
0x3ec3, 0x3ec3, 0x3ec3, 0x3ebe, 0x3ebe, 0x3ebe, 0x3ebe, 0x3ebe, 0x3eba, 0x3eba, 0x3eba, 0x3eba,
0x3eba, 0x3eb5, 0x3eb5, 0x3eb5, 0x3eb5, 0x3eb1, 0x3eb1, 0x3eb1, 0x3eb1, 0x3eb1, 0x3eb1, 0x3eac,
0x3eac, 0x3eac, 0x3eac, 0x3eac, 0x3ea8, 0x3ea8, 0x3ea8, 0x3ea8, 0x3ea8, 0x3ea8, 0x3ea8, 0x3ea8,
0x3ea8, 0x3ea3, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9a,
0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e91,
0x3e91, 0x3e91, 0x3e91, 0x3e91, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c,
0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e82, 0x3e82, 0x3e82, 0x3e82, 0x3e82, 0x3e82, 0x3e7b,
0x3e7b, 0x3e7b, 0x3e7b, 0x3e7b, 0x3e7b, 0x3e71, 0x3e71, 0x3e71, 0x3e71, 0x3e71, 0x3e71, 0x3e71,
0x3e71, 0x3e71, 0x3e67, 0x3e67, 0x3e67, 0x3e67, 0x3e67, 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5e,
0x3e5e, 0x3e5e, 0x3e5e, 0x3e54, 0x3e54, 0x3e54, 0x3e54, 0x3e54, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a,
0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e40, 0x3e40, 0x3e40, 0x3e40, 0x3e40, 0x3e40, 0x3e36,
0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e2c, 0x3e2c, 0x3e2c, 0x3e2c,
0x3e2c, 0x3e2c, 0x3e22, 0x3e22, 0x3e22, 0x3e22, 0x3e22, 0x3e22, 0x3e22, 0x3e22, 0x3e18, 0x3e18,
0x3e18, 0x3e18, 0x3e18, 0x3e18, 0x3e18, 0x3e18, 0x3e0e, 0x3e0e, 0x3e0e, 0x3e0e, 0x3e0e, 0x3e0e,
0x3e0e, 0x3e0e, 0x3e04, 0x3e04, 0x3e04, 0x3e04, 0x3e04, 0x3e04, 0x3e04, 0x3e04, 0x3df5, 0x3df5,
0x3df5, 0x3df5, 0x3df5, 0x3df5, 0x3df5, 0x3df5, 0x3de0, 0x3de0, 0x3de0, 0x3de0, 0x3de0, 0x3de0,
0x3de0, 0x3de0, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc,
0x3db8, 0x3db8, 0x3db8, 0x3db8, 0x3db8, 0x3db8, 0x3db8, 0x3da3, 0x3da3, 0x3da3, 0x3da3, 0x3da3,
0x3da3, 0x3da3, 0x3da3, 0x3da3, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f,
0x3d8f, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d4d, 0x3d4d,
0x3d4d, 0x3d4d, 0x3d4d, 0x3d4d, 0x3d4d, 0x3d4d, 0x3d4d, 0x3d24, 0x3d24, 0x3d24, 0x3d24, 0x3d24,
0x3d24, 0x3d24, 0x3d24, 0x3d24, 0x3d24, 0x3cf6, 0x3cf6, 0x3cf6, 0x3cf6, 0x3cf6, 0x3cf6, 0x3cf6,
0x3cf6, 0x3cf6, 0x3cf6, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4,
0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x0, 0x0,
0x0, 0x0, 0x0,
};
// Host-side reference for atan2(y, x); simply forwards to the math library.
// Equivalent closed form: f(y, x) = 2 * atan(y / (pow(x*x+y*y, 0.5) + x)).
static double _gen_atan2(float y, float x) {
  const double angle = atan2(y, x);
  return angle;
}
// Builds the host reference output for the atan2 test.
//
// ofmap       : destination for the bf16 reference values (must not be NULL)
// ifmap       : bf16 x inputs
// ifmap2      : bf16 y inputs
// ifmap_shape : tensor shape; its element count is used in every mode except
//               PRE_DATA_COMPARE_FIX, where the golden table length is used instead
//
// Per element the reference is atan2(y, x) rounded to bf16. In PRE_DATA_COMPARE_FIX mode
// the value is then replaced by the matching golden_bf16 entry; in DATA_COMPARE_U8 mode it
// is replaced by its convert_bf16_s8() result truncated to 8 bits.
static void tl_lut_ref(uint16_t *ofmap, uint16_t *ifmap, uint16_t *ifmap2,
                       cvk_tl_shape_t ifmap_shape) {
  assert(ofmap);

  const bool use_golden = (mode == PRE_DATA_COMPARE_FIX);
  const uint32_t shape_elems = tl_shape_size(&ifmap_shape);
  const uint32_t golden_elems = sizeof(golden_bf16) / sizeof(golden_bf16[0]);
  const uint32_t count = use_golden ? golden_elems : shape_elems;

  for (uint32_t idx = 0; idx != count; ++idx) {
    const float y_val = convert_bf16_fp32(ifmap2[idx]);
    const float x_val = convert_bf16_fp32(ifmap[idx]);
    uint16_t expected = convert_fp32_bf16(_gen_atan2(y_val, x_val));

    if (use_golden) {
      expected = golden_bf16[idx];
    } else if (mode == DATA_COMPARE_U8) {
      expected = (uint8_t)convert_bf16_s8(expected);
    }
    ofmap[idx] = expected;
  }
}
// Compares the device output against the host reference, element by element.
//
// ofmap_data : bf16 values read back from the device
// ref_data   : bf16 reference values
// ifmap      : bf16 x inputs, used only in the failure report
// ifmap2     : bf16 y inputs, used only in the failure report
// ifmap_size : number of elements to compare; replaced by the golden table length when
//              mode == PRE_DATA_COMPARE_FIX
// epsilon    : absolute fp32 tolerance, used in every mode except PRE_DATA_COMPARE_FIX,
//              which requires a bit-exact match
//
// Returns true when every element matches. A mismatch is reported on stderr and, with the
// current tolerance setting, terminates the process with exit(-1); false is never returned.
static bool verify(uint16_t *ofmap_data, uint16_t *ref_data, uint16_t *ifmap, uint16_t *ifmap2,
                   uint64_t ifmap_size, float epsilon) {
  uint64_t size = ifmap_size;
  if (mode == PRE_DATA_COMPARE_FIX) {
    size = sizeof(golden_bf16) / sizeof(golden_bf16[0]);
  }

  // Number of mismatches tolerated before aborting. -1 aborts on the first mismatch
  // (the post-incremented counter is always >= -1); raise it to see more failures per run.
  const int tolerant_max = -1;
  int tolerant_cnt = 0;

  for (uint64_t i = 0; i < size; i++) {
    bool is_close;
    uint16_t ref = ref_data[i];
    uint16_t ofmap_data_bf16 = ofmap_data[i];
    float ref_f = convert_bf16_fp32(ref);
    float ofmap_data_f = convert_bf16_fp32(ofmap_data_bf16);

    if (mode == PRE_DATA_COMPARE_FIX) {
      is_close = ofmap_data_bf16 == ref;
    } else {
      is_close = fabs(ref_f - ofmap_data_f) < epsilon;
    }

    if (!is_close) {
      float y = convert_bf16_fp32(ifmap2[i]);
      float x = convert_bf16_fp32(ifmap[i]);
      // The index is uint64_t; it is cast and printed with %llu because %lu is only
      // 32 bits wide on ILP32 targets.
      fprintf(stderr,
              "comparing failed at ofmap_data[%llu]\n"
              "\tgot %x, exp %x, fp32: got %f exp %f, atan2(%f, %f) = %f"
              "\ty %f(0x%x), x %f(0x%x)\n",
              (unsigned long long)i, ofmap_data_bf16, ref, ofmap_data_f, ref_f, y, x,
              _gen_atan2(y, x), y, ifmap2[i], x, ifmap[i]);
      if (tolerant_cnt++ >= tolerant_max) {
        exit(-1);
      }
    }
  }
  return true;
}
// Fills input_data with a deterministic bf16 pattern.
//
// Element i is the bf16 rounding of
//     (i % (range_end - 2)) + 0.03 + (i % 256) * 0.002
// so the same data is produced on every run.
//
// input_data  : destination buffer holding at least ifmap_size entries
// ifmap_size  : number of entries to write
// range_start : unused; kept so the signature stays unchanged
// range_end   : sets the period of the integer part (range_end - 2); must not be 2
//
// The previous implementation also built a std::mt19937 / uniform distribution and called
// rand(), but every value drawn was overwritten or discarded, so the generated data never
// depended on them. That dead work has been removed.
static void _gen_input(uint16_t *input_data, uint64_t ifmap_size, int range_start, int range_end) {
  (void)range_start;
  // A zero divisor in the modulo below would be undefined behaviour.
  assert(range_end != 2);

  const int table_hw = 256;
  const int period = range_end - 2;
  for (uint64_t i = 0; i < ifmap_size; i++) {
    const float input = ((int)i % period) + 0.03 + (i % table_hw) * 0.002;
    input_data[i] = convert_fp32_bf16(input);
  }
}
static void gen_input(uint16_t *x, uint16_t *y, uint64_t ifmap_size, TEST_MODE mode,
int range_start, int range_end) {
if (mode == PRE_DATA_COMPARE_FIX) {
memcpy(x, &test_pattern, sizeof(test_pattern));
} else {
range_start = abs(range_start);
range_end = abs(range_end);
_gen_input(x, ifmap_size, range_start, range_end);
}
// invert for test
for (uint64_t i = 0; i < ifmap_size; i++) {
y[i] = x[(ifmap_size - 1) - i];
}
if (mode == DATA_COMPARE_ACCURACY_X_GT_0) {
// y = any
uint32_t i = 0;
for (; i < ifmap_size / 4; i++) {
// y < 0
y[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(y[i]));
y[i + ifmap_size / 4] = convert_fp32_bf16(0);
}
} else if (mode == DATA_COMPARE_ACCURACY_X_LT_0_Y_GE_0) {
// x < 0 and y >= 0
for (uint32_t i = 0; i < ifmap_size; i++) {
x[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(x[i]));
}
for (uint32_t i = 0; i < ifmap_size / 4; i++) {
y[i + ifmap_size / 4] = convert_fp32_bf16(0);
}
} else if (mode == DATA_COMPARE_ACCURACY_X_LT_0_Y_LT_0) {
// x < 0 and y < 0
for (uint32_t i = 0; i < ifmap_size; i++) {
x[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(x[i]));
y[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(y[i]));
}
} else if (mode == DATA_COMPARE_ACCURACY_X_0_Y_GT_0) {
// pi / 2, x = 0 and y > 0
for (uint32_t i = 0; i < ifmap_size; i++) {
x[i] = convert_fp32_bf16(0);
}
} else if (mode == DATA_COMPARE_ACCURACY_X_0_Y_LT_0) {
// -pi / 2, x = 0 and y < 0
for (uint32_t i = 0; i < ifmap_size; i++) {
x[i] = convert_fp32_bf16(0);
y[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(y[i]));
}
}
#if 1
if (mode != PRE_DATA_COMPARE_FIX) {
int i = 0;
x[i] = convert_fp32_bf16(0);
y[i++] = convert_fp32_bf16(1.394531);
x[i] = convert_fp32_bf16(0);
y[i++] = convert_fp32_bf16(0.394531);
x[i] = convert_fp32_bf16(0);
y[i++] = convert_fp32_bf16(0.594531);
x[i] = convert_fp32_bf16(-10.0);
y[i++] = convert_fp32_bf16(6.0);
x[i] = convert_fp32_bf16(1.0);
y[i++] = convert_fp32_bf16(-1.);
x[i] = convert_fp32_bf16(-1.0);
y[i++] = convert_fp32_bf16(1.);
x[i] = convert_fp32_bf16(0.111816);
y[i++] = convert_fp32_bf16(0);
x[i] = convert_fp32_bf16(2.031250);
y[i++] = convert_fp32_bf16(0.0);
x[i] = convert_fp32_bf16(-2.031250);
x[i] = convert_fp32_bf16(0);
y[i++] = convert_fp32_bf16(-1.394531);
y[i++] = convert_fp32_bf16(0.0);
x[i] = convert_fp32_bf16(0);
y[i++] = convert_fp32_bf16(-6.0);
x[i] = convert_fp32_bf16(0);
y[i++] = convert_fp32_bf16(-0.394531);
x[i] = convert_fp32_bf16(0);
y[i++] = convert_fp32_bf16(-0.594531);
x[i] = convert_fp32_bf16(0);
y[i++] = convert_fp32_bf16(0.0);
x[i] = convert_fp32_bf16(-8);
y[i++] = convert_fp32_bf16(0);
x[i] = convert_fp32_bf16(0);
y[i++] = convert_fp32_bf16(3.0);
x[i] = convert_fp32_bf16(-1.0);
y[i++] = convert_fp32_bf16(-5.0);
x[i] = convert_fp32_bf16(-2.484375);
y[i++] = convert_fp32_bf16(-7.531250);
x[i++] = convert_fp32_bf16(-125.000000);
y[i] = convert_fp32_bf16(5.000000);
x[i++] = convert_fp32_bf16(-8.000000);
y[i] = convert_fp32_bf16(1.000000);
x[i++] = convert_fp32_bf16(19.000000);
y[i] = convert_fp32_bf16(1.070312);
x[i++] = convert_fp32_bf16(0.498046);
y[i] = convert_fp32_bf16(0.000000);
x[i] = convert_fp32_bf16(424.000);
y[i++] = convert_fp32_bf16(-1.00);
x[i] = convert_fp32_bf16(2.484375);
y[i++] = convert_fp32_bf16(-7.531250);
x[i] = convert_fp32_bf16(0);
y[i++] = convert_fp32_bf16(7.531250);
x[i] = convert_fp32_bf16(0);
y[i++] = convert_fp32_bf16(-7.531250);
x[i] = convert_fp32_bf16(0);
y[i++] = convert_fp32_bf16(0.394531);
y[i] = convert_fp32_bf16(-4.000000);
x[i++] = convert_fp32_bf16(-64.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-4.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-40.000000);
y[i] = convert_fp32_bf16(1.000000);
x[i++] = convert_fp32_bf16(-53.000000);
y[i] = convert_fp32_bf16(-9.000000);
x[i++] = convert_fp32_bf16(-91.000000);
y[i] = convert_fp32_bf16(12.000000);
x[i++] = convert_fp32_bf16(-164.000000);
y[i] = convert_fp32_bf16(-20.000000);
x[i++] = convert_fp32_bf16(-320.000000);
y[i] = convert_fp32_bf16(1.000000);
x[i++] = convert_fp32_bf16(-71.000000);
y[i] = convert_fp32_bf16(1.000000);
x[i++] = convert_fp32_bf16(-155.000000);
y[i] = convert_fp32_bf16(1.000000);
x[i++] = convert_fp32_bf16(-247.000000);
y[i] = convert_fp32_bf16(-2.000000);
x[i++] = convert_fp32_bf16(-118.000000);
y[i] = convert_fp32_bf16(-2.000000);
x[i++] = convert_fp32_bf16(-54.000000);
y[i] = convert_fp32_bf16(-5.000000);
x[i++] = convert_fp32_bf16(-392.000000);
y[i] = convert_fp32_bf16(-37.000000);
x[i++] = convert_fp32_bf16(-520.000000);
y[i] = convert_fp32_bf16(-1.000000);
x[i++] = convert_fp32_bf16(-19.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-10.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-8.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-2.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-14.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-2.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-6.000000);
y[i] = convert_fp32_bf16(-1.000000);
x[i++] = convert_fp32_bf16(-21.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-14.000000);
y[i] = convert_fp32_bf16(-1.000000);
x[i++] = convert_fp32_bf16(-17.000000);
y[i] = convert_fp32_bf16(-1.000000);
x[i++] = convert_fp32_bf16(-17.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-8.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-4.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-10.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-8.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-14.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-4.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-2.000000);
y[i] = convert_fp32_bf16(1.000000);
x[i++] = convert_fp32_bf16(-41.000000);
y[i] = convert_fp32_bf16(1.000000);
x[i++] = convert_fp32_bf16(-69.000000);
y[i] = convert_fp32_bf16(4.000000);
x[i++] = convert_fp32_bf16(-86.000000);
y[i] = convert_fp32_bf16(1.000000);
x[i++] = convert_fp32_bf16(-41.000000);
y[i] = convert_fp32_bf16(-2.000000);
x[i++] = convert_fp32_bf16(-34.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-6.000000);
y[i] = convert_fp32_bf16(1.000000);
x[i++] = convert_fp32_bf16(-41.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-136.000000);
y[i] = convert_fp32_bf16(-3.000000);
x[i++] = convert_fp32_bf16(-79.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-38.000000);
y[i] = convert_fp32_bf16(5.000000);
x[i++] = convert_fp32_bf16(-173.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-78.000000);
y[i] = convert_fp32_bf16(-2.000000);
x[i++] = convert_fp32_bf16(-60.000000);
y[i] = convert_fp32_bf16(3.000000);
x[i++] = convert_fp32_bf16(-123.000000);
y[i] = convert_fp32_bf16(-9.000000);
x[i++] = convert_fp32_bf16(-280.000000);
y[i] = convert_fp32_bf16(3.000000);
x[i++] = convert_fp32_bf16(-39.000000);
y[i] = convert_fp32_bf16(2.000000);
x[i++] = convert_fp32_bf16(-524.000000);
y[i] = convert_fp32_bf16(11.000000);
x[i++] = convert_fp32_bf16(-376.000000);
y[i] = convert_fp32_bf16(5.000000);
x[i++] = convert_fp32_bf16(-131.000000);
y[i] = convert_fp32_bf16(11.000000);
x[i++] = convert_fp32_bf16(-324.000000);
y[i] = convert_fp32_bf16(9.000000);
x[i++] = convert_fp32_bf16(-125.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-92.000000);
y[i] = convert_fp32_bf16(-7.000000);
x[i++] = convert_fp32_bf16(-233.000000);
y[i] = convert_fp32_bf16(10.000000);
x[i++] = convert_fp32_bf16(-170.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-4.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-4.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-10.000000);
y[i] = convert_fp32_bf16(-1.000000);
x[i++] = convert_fp32_bf16(-23.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-6.000000);
y[i] = convert_fp32_bf16(0.000000);
x[i++] = convert_fp32_bf16(-6.000000);
y[i] = convert_fp32_bf16(-3.000000);
x[i++] = convert_fp32_bf16(-37.000000);
y[i] = convert_fp32_bf16(-9);
x[i++] = convert_fp32_bf16(-1);
y[i] = convert_fp32_bf16(7.0);
x[i++] = convert_fp32_bf16(-1);
y[i] = convert_fp32_bf16(0);
x[i++] = convert_fp32_bf16(-1);
}
#else
for (uint64_t i = 0; i < ifmap_size; i++) {
x[i] = convert_fp32_bf16(5.375000);
y[i] = convert_fp32_bf16(2.203125);
}
#endif
#ifdef DBG
for (uint64_t i = 0; i < ifmap_size; i++) {
printf("source[%ld] y %f x %f\n", i, convert_bf16_fp32(y[i]), convert_bf16_fp32(x[i]));
}
#endif /* ifdef DBG */
}
// Runs one atan2 test pass for the current global `mode`:
//   1. allocates the local tensors and host buffers,
//   2. generates inputs (x, y) and the host reference,
//   3. builds the reciprocal / atan lookup tables and uploads inputs and tables,
//   4. emits cvm_atan2_merge_emit, reads the result back and checks it with verify(),
//   5. releases every local tensor and host buffer.
// verify() terminates the process on a mismatch, so its return value is not inspected here.
static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) {
// TODO: check more shape / align
cvk_chip_info_t chip_info = bmk->info;
uint32_t input_n = 1;
uint32_t input_c = chip_info.npu_num;
uint32_t input_h = 16;
uint32_t input_w = 16;
// absolute fp32 tolerance handed to verify()
float epsilon = 0.1;
int range_start = -8;
int range_end = 8;
// the fixed-pattern mode uses a smaller h x w
if (mode == PRE_DATA_COMPARE_FIX) {
input_h = 4;
input_w = 8;
}
cvk_fmt_t fmt = CVK_FMT_BF16;
cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w};
cvk_tl_shape_t ofmap_shape = ifmap_shape;
// get lut table shape and size
cvk_tl_shape_t table_shape;
uint64_t table_bytesize = cvm_lut_tbl_bytesize(bmk, &table_shape, fmt);
// get input / output size
uint64_t ifmap_size = tl_shape_size(&ifmap_shape);
uint64_t ofmap_size = tl_shape_size(&ofmap_shape);
int data_type_size = bytesize_of_fmt(fmt);
uint64_t ifmap_bytesize = ifmap_size * data_type_size;
uint64_t ofmap_bytesize = ofmap_size * data_type_size;
// in fixed-pattern mode the reference buffer is sized from the golden table instead
if (mode == PRE_DATA_COMPARE_FIX) {
ofmap_bytesize = sizeof(golden_bf16) / sizeof(golden_bf16[0]) * data_type_size;
}
// atan2 takes two inputs
cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_ifmap2 = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1);
cvk_tl_t *out = tl_ofmap_bf16;
// atan buf
cvk_tl_t *tl_y0_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_invert_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_pos_neg_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
// reciprocal buf
cvk_tl_t *tl_reciprocal_table_answer = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_reciprocal_table_answer_mantissa = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
// temp buf
cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_buf2 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1);
cvk_tl_t *tl_buf3 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1);
uint16_t *input_data = (uint16_t *)xmalloc(ifmap_bytesize);
uint16_t *input_data2 = (uint16_t *)xmalloc(ifmap_bytesize);
uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize);
// for reciprocal
uint16_t *table_reciprocal_data = (uint16_t *)xmalloc(table_bytesize);
uint16_t *table_reciprocal_data_mantissa = (uint16_t *)xmalloc(table_bytesize);
// for atan
uint16_t *table_data_atan_y0 = (uint16_t *)xmalloc(table_bytesize);
uint16_t *table_data_atan_invert = (uint16_t *)xmalloc(table_bytesize);
uint16_t *table_data_atan_pos_neg = (uint16_t *)xmalloc(table_bytesize);
// for search '0' index
// NOTE(review): this table is filled by cvm_gen_0_tbl below but is never uploaded or read
// in this function; it is only freed at the end.
uint16_t *idx_0_table_data = (uint16_t *)xmalloc(table_bytesize);
// init input / ref
// input_data is x, input_data2 is y
gen_input(input_data, input_data2, ifmap_size, mode, range_start, range_end);
tl_lut_ref(ref_data, input_data, input_data2, ifmap_shape);
// init lut table
cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape);
cvm_atan_tbl(table_data_atan_y0, NULL, table_data_atan_invert, table_data_atan_pos_neg,
&table_shape);
cvm_gen_0_tbl(idx_0_table_data, &table_shape);
// sys->local
test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)input_data);
test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap2, (uint8_t *)input_data2);
test_put_tensor_g2l_comp(ctx, bmk, tl_reciprocal_table_answer, (uint8_t *)table_reciprocal_data);
test_put_tensor_g2l_comp(ctx, bmk, tl_reciprocal_table_answer_mantissa,
(uint8_t *)table_reciprocal_data_mantissa);
test_put_tensor_g2l_comp(ctx, bmk, tl_y0_buf, (uint8_t *)table_data_atan_y0);
test_put_tensor_g2l_comp(ctx, bmk, tl_invert_buf, (uint8_t *)table_data_atan_invert);
test_put_tensor_g2l_comp(ctx, bmk, tl_pos_neg_buf, (uint8_t *)table_data_atan_pos_neg);
// y (tl_ifmap2) is passed before x (tl_ifmap); the result lands in tl_ofmap_bf16
cvm_atan2_merge_emit(bmk, tl_ifmap2, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, tl_invert_buf,
tl_pos_neg_buf, tl_reciprocal_table_answer,
tl_reciprocal_table_answer_mantissa, OUT tl_ofmap_bf16, fmt);
// local->sys: read the result back into a host buffer that is freed below
uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, out);
verify(ofmap_data, ref_data, input_data, input_data2, ifmap_size, epsilon);
// local tensors are released in the exact reverse of their allocation order
// (presumably required by the local-memory allocator - confirm before reordering)
free_tl(bmk, tl_buf3);
free_tl(bmk, tl_buf2);
free_tl(bmk, tl_buf);
free_tl(bmk, tl_reciprocal_table_answer_mantissa);
free_tl(bmk, tl_reciprocal_table_answer);
free_tl(bmk, tl_pos_neg_buf);
free_tl(bmk, tl_invert_buf);
free_tl(bmk, tl_y0_buf);
free_tl(bmk, tl_ofmap_bf16);
free_tl(bmk, tl_ifmap2);
free_tl(bmk, tl_ifmap);
// host buffers
free(idx_0_table_data);
free(table_data_atan_y0);
free(table_data_atan_invert);
free(table_data_atan_pos_neg);
free(table_reciprocal_data);
free(table_reciprocal_data_mantissa);
free(input_data);
free(ref_data);
free(ofmap_data);
free(input_data2);
}
// Entry point of the atan2 test: runs testbench() once for every TEST_MODE value in
// [PRE_DATA_COMPARE_FIX, DATA_COMPARE_U8), publishing the value through the global `mode`
// before each pass. The rounding mode saved at start-up is restored before returning.
int main() {
  CVI_RT_HANDLE ctx;
  cvk_context_t *bmk;

  const int saved_round_mode = set_store_feround();
  test_init(&ctx, &bmk);

  int current = PRE_DATA_COMPARE_FIX;
  while (current < DATA_COMPARE_U8) {
    mode = static_cast<TEST_MODE>(current);
    printf("test mode %d...\n", mode);
    testbench(&ctx, bmk);
    ++current;
  }

  printf("pass\n");
  test_exit(&ctx, bmk);
  restore_feround(saved_round_mode);
  return 0;
}

View File

@ -0,0 +1,148 @@
// \file sample that converts a bf16 tensor to fp32 with cvm_bf16_fp32 and checks the result
// header include
#include <assert.h>
#include <cvimath_internal.h> // math
#include <test_cvikernel_util.h> // kerenl
// Fills input_data with the bf16 encodings of 0.0, 1.0, 2.0, ...: entry i holds the
// value i converted by convert_fp32_bf16.
void init_input(uint16_t *input_data, uint64_t ifmap_size) {
  uint64_t idx = 0;
  while (idx < ifmap_size) {
    const double value = idx * 1.0;
    input_data[idx] = convert_fp32_bf16(value);
    ++idx;
  }
}
// Builds the expected fp32 bit patterns for a bf16 -> fp32 widening.
//
// input_data : ifmap_size bf16 values
// ref_data   : receives ifmap_size 32-bit patterns
// ifmap_size : number of elements to convert
//
// Each output word carries the bf16 bits in its upper 16 bits and zeros in the lower 16.
// The shift replaces a union-based type pun, which read an inactive union member
// (undefined in C++) and only placed the bits correctly on little-endian hosts.
void init_ref(uint16_t *input_data, uint32_t *ref_data, uint64_t ifmap_size) {
  for (uint64_t i = 0; i < ifmap_size; i++) {
    ref_data[i] = static_cast<uint32_t>(input_data[i]) << 16;
  }
}
// Converts one bf16 tensor of shape *bf16_tg_shape to fp32 on the device and compares the
// result byte by byte with a host-computed reference. Exits the process with -1 on the
// first mismatching byte.
//
// rt_ctx        : runtime handle used for device memory and command submission
// cvk_ctx       : kernel context used to build the conversion command
// bf16_tg_shape : shape (n, c, h, w) of the bf16 source tensor
static void testbench(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx,
                      cvk_tg_shape_t *bf16_tg_shape) {
  // for calculate size we need in host
  cvk_tl_shape_t ifmap_shape = {bf16_tg_shape->n, bf16_tg_shape->c, bf16_tg_shape->h,
                                bf16_tg_shape->w};
  // The fp32 result is held in a bf16-format tensor that is twice as wide:
  // one fp32 value occupies two 16-bit slots.
  cvk_tl_shape_t ofmap_shape = {bf16_tg_shape->n, bf16_tg_shape->c, bf16_tg_shape->h,
                                bf16_tg_shape->w * 2};
  uint64_t ifmap_size = tl_shape_size(&ifmap_shape);
  uint64_t ofmap_size = tl_shape_size(&ofmap_shape);

  // unit size is 1 bytes, bf16 takes 2 bytes
  int data_type_size = 2;
  uint64_t ifmap_bytesize = ifmap_size * data_type_size;
  // ofmap_size already counts two 16-bit slots per fp32 value, so the byte size is
  // ofmap_size * 2 (= ifmap_size * 4). The former extra "* 2" counted the widening twice.
  uint64_t ofmap_bytesize = ofmap_size * data_type_size;

  uint8_t *input_data = (uint8_t *)xmalloc(ifmap_bytesize);
  uint8_t *ref_data = (uint8_t *)xmalloc(ofmap_bytesize);

  // init input / output data in ddr
  init_input((uint16_t *)input_data, ifmap_size);
  init_ref((uint16_t *)input_data, (uint32_t *)ref_data, ifmap_size);

  // send host memory->device memory
  cvk_fmt_t fmt = CVK_FMT_BF16;
  cvk_tg_shape_t fp32_tg_shape;
  fp32_tg_shape = {ofmap_shape.n, ofmap_shape.c, ofmap_shape.h, ofmap_shape.w};

  cvk_tg_t *bf16_tg = test_alloc_tg_mem_comp(rt_ctx, cvk_ctx, *bf16_tg_shape, fmt);
  assert(bf16_tg && "alloc bf16 fail");
  test_put_tg_mem_comp(rt_ctx, bf16_tg, (uint8_t *)input_data);

  cvk_tg_t *fp32_tg = test_alloc_tg_mem_comp(rt_ctx, cvk_ctx, fp32_tg_shape, fmt);
  // check the tensor that was just allocated (this used to re-check bf16_tg)
  assert(fp32_tg && "alloc fp32 fail");

  // prepare command buffer
  cvm_bf16_fp32(cvk_ctx, bf16_tg, fp32_tg);

  // submit descriptor
  test_submit_comp(rt_ctx, cvk_ctx);

  // get data from tg
  // NOTE(review): assumes the returned buffer holds the whole fp32_tg tensor, i.e.
  // ofmap_bytesize bytes - confirm against test_get_tg_mem_comp.
  uint8_t *ofmap_data = test_get_tg_mem_comp(rt_ctx, fp32_tg);

  // Compare every output byte. The loop used to stop at ofmap_size, an element count,
  // which covered only the first half of the output.
  for (uint64_t i = 0; i < ofmap_bytesize; i++) {
    if (ref_data[i] != ofmap_data[i]) {
      fprintf(stderr, "comparing failed output[%llu] got %u, ref %u\n", (unsigned long long)i,
              ofmap_data[i], ref_data[i]);
      // fail case
      exit(-1);
    }
  }

  // free resource from tpu memory
  test_free_tg_mem_comp(rt_ctx, bf16_tg);
  test_free_tg_mem_comp(rt_ctx, fp32_tg);

  // free resource from host memory
  free(input_data);
  free(ref_data);
  free(ofmap_data);
}
// Entry point of the bf16 -> fp32 sample: runs testbench() on two fixed shapes and then on
// a coarse sweep of shapes below <40,40,128,256>. The rounding mode saved at start-up is
// restored before returning.
int main() {
  CVI_RT_HANDLE rt_ctx;
  cvk_context_t *cvk_ctx;
  int round_mode;

  // align kernel rounding mode
  round_mode = set_store_feround();

  // init runtime / kernel structure
  test_init(&rt_ctx, &cvk_ctx);

  cvk_tg_shape_t bf16_tg_shape = {1, 2, 3, 4};
  {
    // test 1
    printf("test bf16 <%d,%d,%d,%d> to fp32\n", bf16_tg_shape.n, bf16_tg_shape.c, bf16_tg_shape.h,
           bf16_tg_shape.w);
    testbench(&rt_ctx, cvk_ctx, &bf16_tg_shape);
    printf("compare test bf16 to fp32 done\n");
  }
  {
    // test 2
    bf16_tg_shape = {1, 20, 30, 40};
    printf("test bf16 <%d,%d,%d,%d> to fp32\n", bf16_tg_shape.n, bf16_tg_shape.c, bf16_tg_shape.h,
           bf16_tg_shape.w);
    testbench(&rt_ctx, cvk_ctx, &bf16_tg_shape);
    printf("compare test bf16 to fp32 done\n");
  }

  // shape sweep: bf16_tg_shape now only holds the exclusive upper bounds
  bf16_tg_shape = {40, 40, 128, 256};
  for (int n = 1; n < (int)bf16_tg_shape.n; n += 10) {
    for (int c = 1; c < (int)bf16_tg_shape.c; c += 10) {
      for (int h = 1; h < (int)bf16_tg_shape.h; h += 100) {
        for (int w = 2; w < (int)bf16_tg_shape.w; w += 100) {
          // Log the shape under test; this used to print the sweep bounds on every pass.
          printf("test bf16 <%d,%d,%d,%d> to fp32\n", n, c, h, w);
          cvk_tg_shape_t _bf16_tg_shape = {(uint32_t)n, (uint32_t)c, (uint32_t)h, (uint32_t)w};
          testbench(&rt_ctx, cvk_ctx, &_bf16_tg_shape);
          printf("compare test bf16 to fp32 done\n");
        }
      }
    }
  }

  // de-init runtime / kernel structure
  test_exit(&rt_ctx, cvk_ctx);

  // restore rounding mode
  restore_feround(round_mode);
  return 0;
}

View File

@ -0,0 +1,60 @@
#include <cvimath_internal.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>
#include <cstdlib>
#include <iostream>
// CPU benchmark for the inner-product match helpers: fills a 512-byte query and a
// 20000 x 512 database with random bytes, precomputes the per-row unit lengths, then times
// and prints the top-5 results of the uint8 and the int8 matcher on the same data.
int main() {
  srand(time(NULL));

  const uint32_t data_length = 512;
  const uint32_t data_num = 20000;

  uint8_t *db = new uint8_t[data_num * data_length];
  float *db_unit = new float[data_num];
  uint8_t *data = new uint8_t[data_length];
  float *buffer_f = new float[data_num];
  memset(buffer_f, 0, data_num * sizeof(float));

  // query first, then the database, so the rand() sequence is consumed in the same order
  for (uint32_t pos = 0; pos < data_length; ++pos) {
    data[pos] = rand() % 256;
  }
  const uint32_t db_bytes = data_num * data_length;
  for (uint32_t pos = 0; pos < db_bytes; ++pos) {
    db[pos] = rand() % 256;
  }

  cvm_gen_db_unit_length(db, db_unit, data_length, data_num);

  const uint32_t k = 5;
  uint32_t k_index[k] = {0};
  float k_value[k] = {0};
  struct timeval begin, end;

  // uint8 matcher
  gettimeofday(&begin, NULL);
  cvm_cpu_u8data_ip_match(data, db, db_unit, k_index, k_value, buffer_f, data_length, data_num, k);
  gettimeofday(&end, NULL);
  unsigned long elapsed_us =
      ((end.tv_sec - begin.tv_sec) * 1000000 + end.tv_usec - begin.tv_usec);
  printf("Searching time uint8: %lu us\n", elapsed_us);
  printf("Result:\n");
  for (uint32_t rank = 0; rank < k; ++rank) {
    printf("[%u] %f\n", k_index[rank], k_value[rank]);
  }
  printf("\n");

  // int8 matcher on the same bytes, reinterpreted as signed
  gettimeofday(&begin, NULL);
  cvm_cpu_i8data_ip_match((int8_t *)data, (int8_t *)db, db_unit, k_index, k_value, buffer_f,
                          data_length, data_num, k);
  gettimeofday(&end, NULL);
  elapsed_us = ((end.tv_sec - begin.tv_sec) * 1000000 + end.tv_usec - begin.tv_usec);
  printf("Searching time int8: %lu us\n", elapsed_us);
  printf("Result:\n");
  for (uint32_t rank = 0; rank < k; ++rank) {
    printf("[%u] %f\n", k_index[rank], k_value[rank]);
  }
  printf("\n");

  delete[] data;
  delete[] db;
  delete[] db_unit;
  delete[] buffer_f;
  return 0;
}

View File

@ -0,0 +1,134 @@
#include <cvimath_internal.h>
#include <cviruntime.h>
#include <cviruntime_context.h>
#include <string.h>
#include <sys/time.h>
#include <time.h>
#include <cmath>
#include <cstdlib>
#include <iostream>
// Matches one int8 feature vector against a database on the TPU and returns the k best
// normalised inner products.
//
// ctx               : runtime handle, used to invalidate gemm_device before the host reads it
// cvk_ctx           : kernel context used to build and submit the GEMM
// a_gaddr / a_vaddr : device address / host pointer of the query vector (data_length int8)
// db_gaddr          : device address of the database matrix
// unit_db_arr       : data_num precomputed lengths, one per database entry
// k_index / k_value : outputs, k entries each: index and score of the best matches, best first
// buffer_gemm_gaddr / buffer_gemm_vaddr / gemm_device :
//                     device address, host pointer and memory handle of the GEMM output
// buffer_i32        : scratch, data_num entries, receives the combined 32-bit GEMM result
// buffer_f          : scratch, data_num entries, receives the scores; clobbered by the selection
// data_length       : feature length
// data_num          : number of database entries
// k                 : number of results wanted
//
// Score i is gemm[i] / (|a| * unit_db_arr[i]). Selected entries are retired by writing
// -INFINITY into buffer_f. The previous sentinel was 0, so once the remaining scores were
// negative an already-selected entry was picked again.
// NOTE(review): if k exceeds data_num the surplus results repeat index 0 with -INFINITY.
void i8data_ip_match(CVI_RT_HANDLE ctx, cvk_context_t *cvk_ctx, uint64_t a_gaddr, int8_t *a_vaddr,
                     uint64_t db_gaddr, float *unit_db_arr, uint32_t *k_index, float *k_value,
                     uint64_t buffer_gemm_gaddr, uint8_t *buffer_gemm_vaddr, uint32_t *buffer_i32,
                     float *buffer_f, CVI_RT_MEM gemm_device, const uint32_t data_length,
                     const uint32_t data_num, const uint32_t k) {
  size_t *slice_num =
      cvm_gemm(cvk_ctx, a_gaddr, db_gaddr, buffer_gemm_gaddr, 1, data_length, data_num, CVK_FMT_I8);
  CVI_RT_Submit(cvk_ctx);
  CVI_RT_MemInvld(ctx, gemm_device);
  cvm_combin_gemm_i8(slice_num, buffer_gemm_vaddr, buffer_i32, 1, data_num);
  free(slice_num);

  // length of the query vector: sqrt(sum a[i]^2)
  int32_t dot_result = 0;
  for (uint32_t i = 0; i < data_length; i++) {
    dot_result += ((short)a_vaddr[i] * a_vaddr[i]);
  }
  float unit_a = sqrt(dot_result);

  // normalise every inner product by both vector lengths
  for (uint32_t i = 0; i < data_num; i++) {
    buffer_f[i] = ((int32_t *)buffer_i32)[i] / (unit_a * unit_db_arr[i]);
  }

  // k passes of "find the maximum, record it, retire it"
  for (uint32_t i = 0; i < k; i++) {
    uint32_t largest = 0;
    for (uint32_t j = 0; j < data_num; j++) {
      if (buffer_f[j] > buffer_f[largest]) {
        largest = j;
      }
    }
    k_value[i] = buffer_f[largest];
    k_index[i] = largest;
    buffer_f[largest] = -INFINITY;
  }
}
/**
 * Sample program: builds a random int8 query and a random int8 database,
 * searches the top-k matches on the TPU and on the CPU, and prints both
 * result lists with their timings.
 *
 * @return 0 always.
 */
int main() {
  CVI_RT_HANDLE ctx;
  CVI_RT_Init(&ctx);
  cvk_context_t *bk_ctx = (cvk_context_t *)CVI_RT_RegisterKernel(ctx, 100000);
  printf("123\n");
  const uint32_t data_length = 512;
  const uint32_t data_num = 1000;
  // Allocate device memory: query (a), database (db) and GEMM output (c).
  CVI_RT_MEM bmmem_a = CVI_RT_MemAlloc(ctx, data_length);
  CVI_RT_MEM bmmem_db = CVI_RT_MemAlloc(ctx, data_length * data_num);
  CVI_RT_MEM bmmem_c = CVI_RT_MemAlloc(ctx, data_num * sizeof(uint32_t));
  uint64_t gaddr_a = CVI_RT_MemGetPAddr(bmmem_a);
  uint64_t gaddr_db = CVI_RT_MemGetPAddr(bmmem_db);
  uint64_t gaddr_c = CVI_RT_MemGetPAddr(bmmem_c);
  uint8_t *vaddr_a = CVI_RT_MemGetVAddr(bmmem_a);
  uint8_t *vaddr_db = CVI_RT_MemGetVAddr(bmmem_db);
  uint8_t *vaddr_c = CVI_RT_MemGetVAddr(bmmem_c);
  // Host-side buffers.
  int8_t *db_raw = new int8_t[data_length * data_num];
  float *db_unit = new float[data_num];
  uint32_t *buffer = new uint32_t[data_num];
  float *buffer_f = new float[data_num];
  // Generate data: every element is in [-10, -1].
  srand(time(NULL));
  for (uint32_t i = 0; i < data_length; i++) {
    ((int8_t *)vaddr_a)[i] = rand() % 10 - 10;
  }
  for (uint32_t j = 0; j < data_num; j++) {
    for (uint32_t i = 0; i < data_length; i++) {
      ((int8_t *)db_raw)[j * data_length + i] = rand() % 10 - 10;
    }
  }
  // Pass db feature to ion: the device copy is the transpose of db_raw
  // (db_raw is data_num x data_length, vaddr_db is data_length x data_num).
  for (uint32_t n = 0; n < data_num * data_length; n++) {
    int i = n / data_num;
    int j = n % data_num;
    ((int8_t *)vaddr_db)[n] = db_raw[data_length * j + i];
  }
  // Calculate unit length for db feature
  cvm_gen_precached_i8_unit_length((int8_t *)db_raw, db_unit, data_length, data_num);
  CVI_RT_MemFlush(ctx, bmmem_a);
  CVI_RT_MemFlush(ctx, bmmem_db);
  const uint32_t k = 5;
  uint32_t k_index[k] = {0};
  float k_value[k] = {0};
  struct timeval t0, t1;
  // TPU search.
  gettimeofday(&t0, NULL);
  i8data_ip_match(ctx, bk_ctx, gaddr_a, (int8_t *)vaddr_a, gaddr_db, db_unit, k_index, k_value,
                  gaddr_c, vaddr_c, buffer, buffer_f, bmmem_c, data_length, data_num, k);
  gettimeofday(&t1, NULL);
  unsigned long elapsed_tpu = ((t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec);
  printf("Searching time tpu int8: %lu us\n", elapsed_tpu);
  printf("Result:\n");
  for (uint32_t i = 0; i < k; i++) {
    printf("[%u] %f\n", k_index[i], k_value[i]);
  }
  printf("\n");
  // CPU reference search on the untransposed database.
  gettimeofday(&t0, NULL);
  cvm_cpu_i8data_ip_match((int8_t *)vaddr_a, (int8_t *)db_raw, db_unit, k_index, k_value, buffer_f,
                          data_length, data_num, k);
  gettimeofday(&t1, NULL);
  elapsed_tpu = ((t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec);
  printf("Searching time int8: %lu us\n", elapsed_tpu);
  printf("Result:\n");
  for (uint32_t i = 0; i < k; i++) {
    printf("[%u] %f\n", k_index[i], k_value[i]);
  }
  printf("\n");
  // Release every host buffer; db_raw was previously leaked.
  delete[] db_raw;
  delete[] db_unit;
  delete[] buffer;
  delete[] buffer_f;
  CVI_RT_MemFree(ctx, bmmem_a);
  CVI_RT_MemFree(ctx, bmmem_db);
  CVI_RT_MemFree(ctx, bmmem_c);
  CVI_RT_UnRegisterKernel(bk_ctx);
  CVI_RT_DeInit(ctx);
  return 0;
}

View File

@ -0,0 +1,907 @@
#include <cvimath_internal.h>
#include <test_cvikernel_util.h>
#include <test_native_ref.h> // calc_dilute_hw
// Constant 32; only used as a bound when drawing a random channel count.
// NOTE(review): presumably the number of NPU lanes on the target - confirm.
#define NPU_NUM (1 << 5)
// Short alias for the depthwise-convolution parameter struct used throughout this test.
typedef cvk_tiu_depthwise_pt_convolution_param_t param_t;
// Seed last passed to srand(); kept global so it can be printed when debugging.
int random_seed;
/**
 * Prints the shapes, signedness flags, padding and stride of a depthwise
 * parameter set to stdout.
 *
 * @param p parameter set; ifmap, weight and ofmap must be non-NULL.
 */
static void print_pooling_param(param_t *p) {
  // Input feature-map and kernel dimensions, read as int for "%d".
  const int batch = p->ifmap->shape.n;
  const int channels = p->ifmap->shape.c;
  const int in_h = p->ifmap->shape.h;
  const int in_w = p->ifmap->shape.w;
  const int kernel_h = p->weight->shape.h;
  const int kernel_w = p->weight->shape.w;
  // A tensor counts as "signed" when its format is CVK_FMT_I8.
  const int input_signed = (p->ifmap->fmt == CVK_FMT_I8);
  const int output_signed = (p->ofmap->fmt == CVK_FMT_I8);

  printf(" Pooling parameters:\n");
  printf(" ifmap = (%d, %d, %d, %d)\n", batch, channels, in_h, in_w);
  printf(" opd0_sign = %d\n", input_signed);
  printf(" weight = (%d, %d)\n", kernel_h, kernel_w);
  printf(" padding = (%d, %d, %d, %d)\n", p->pad_top, p->pad_bottom, p->pad_left, p->pad_right);
  printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w);
  printf(" res0_sign = %d\n", output_signed);
}
/**
 * Allocates and fills the input feature map for one test case.
 *
 * The buffer always holds ic * ih * iw uint16_t slots and is created with
 * new[], so the caller releases it with delete[].
 *  - CVK_FMT_BF16: element i is the bf16 encoding of the value i.
 *  - otherwise:    the first ic * ih * iw bytes hold (i % 10), negated for
 *                  odd i; the rest of the buffer is left unwritten.
 *
 * @param ic   channel count
 * @param ih   input height
 * @param iw   input width
 * @param ifmt input format
 * @return newly allocated buffer
 */
static uint16_t *alloc_input(int ic, int ih, int iw, cvk_fmt_t ifmt) {
  const uint64_t count = ic * ih * iw;
  uint16_t *buf = (uint16_t *)new uint16_t[(count)];

  if (ifmt != CVK_FMT_BF16) {
    // 8-bit formats reuse the same allocation through a byte view.
    uint8_t *bytes = (uint8_t *)buf;
    for (uint64_t idx = 0; idx < count; ++idx) {
      bytes[idx] = idx % 10 * (idx % 2 ? -1 : 1);
    }
    return buf;
  }

  const int half_rand_max = RAND_MAX / 2;  // 5 ~ -5
  for (uint64_t idx = 0; idx < count; ++idx) {
    // rand() is still consumed, but the value actually stored is the index.
    float val = (float)(rand() - half_rand_max) * 5 / (float)RAND_MAX;
    val = idx;
    buf[idx] = convert_fp32_bf16(val);
  }
  return buf;
}
/**
 * Allocates and fills the depthwise kernel for one test case.
 *
 * The buffer holds ic * kh * kw uint16_t slots and is created with malloc(),
 * so the caller releases it with free().
 *  - CVK_FMT_BF16: element i is the bf16 encoding of the value i.
 *  - otherwise:    the first ic * kh * kw bytes hold (i % 5), negated for odd i.
 *
 * @param ic  channel count
 * @param kh  kernel height
 * @param kw  kernel width
 * @param fmt data format
 * @return newly allocated buffer
 */
static uint16_t *alloc_weight(int ic, int kh, int kw, cvk_fmt_t fmt) {
  const int count = ic * kh * kw;
  uint16_t *buf = (uint16_t *)malloc(count * sizeof(uint16_t));

  if (fmt != CVK_FMT_BF16) {
    // 8-bit formats reuse the same allocation through a byte view.
    uint8_t *bytes = (uint8_t *)buf;
    for (int idx = 0; idx < count; ++idx) {
      bytes[idx] = idx % 5 * (idx % 2 ? -1 : 1);
    }
    return buf;
  }

  const int half_rand_max = RAND_MAX / 2;  // 5 ~ -5
  for (int idx = 0; idx < count; ++idx) {
    // rand() is still consumed, but the value actually stored is the index.
    float val = (float)(rand() - half_rand_max) * 5 / (float)RAND_MAX;
    val = idx;
    buf[idx] = convert_fp32_bf16(val);
  }
  return buf;
}
/**
 * Allocates and fills the per-channel bias for one test case.
 *
 * The buffer holds ic uint32_t slots and is created with malloc(), so the
 * caller releases it with free().
 *  - CVK_FMT_BF16: slot i is convert_fp32_hex(i).
 *  - otherwise:    the buffer is viewed as uint16_t and the first ic entries
 *                  hold (i % 15), negated for odd i.
 *
 * @param ic  channel count
 * @param fmt data format
 * @return newly allocated buffer
 */
static uint32_t *alloc_bias(int ic, cvk_fmt_t fmt) {
  const int channels = ic;
  uint32_t *bias = (uint32_t *)malloc(sizeof(uint32_t) * channels);

  if (fmt != CVK_FMT_BF16) {
    // Integer formats store one 16-bit bias per channel.
    uint16_t *halves = (uint16_t *)bias;
    const uint64_t count = channels;
    for (uint64_t idx = 0; idx < count; ++idx) {
      halves[idx] = idx % 0xf * (idx % 2 ? -1 : 1);
    }
    return bias;
  }

  const int half_rand_max = RAND_MAX / 2;  // 2 ~ -2
  for (int ch = 0; ch < channels; ++ch) {
    // rand() is still consumed, but the value actually stored is the index.
    float val = (float)(rand() - half_rand_max) * 2 / (float)RAND_MAX;
    val = ch;
    bias[ch] = convert_fp32_hex(val);
  }
  return bias;
}
/**
 * Allocates an uninitialised output buffer of ic * oh * ow uint16_t elements.
 *
 * @param ic channel count
 * @param oh output height
 * @param ow output width
 * @return buffer created with new[]; release it with delete[].
 */
static uint16_t *alloc_output(int ic, int oh, int ow) {
  const uint64_t elem_count = ic * oh * ow;
  uint16_t *out = (uint16_t *)new uint16_t[(elem_count)];
  return out;
}
/**
 * In-place ReLU over a raw result buffer.
 *
 * @param buf  buffer to clamp; read as bf16 words for CVK_FMT_BF16, as int8
 *             bytes for every other format
 * @param size number of elements to visit
 * @param fmt  format of the data stored in buf
 */
static inline void cvm_relu(uint16_t *buf, uint64_t size, cvk_fmt_t fmt) {
  if (fmt != CVK_FMT_BF16) {
    // Integer path: negative bytes become 0.
    int8_t *bytes = (int8_t *)buf;
    for (uint64_t idx = 0; idx < size; ++idx) {
      if (bytes[idx] < 0) {
        bytes[idx] = 0;
      }
    }
    return;
  }

  // bf16 path: decode, test the sign, store an encoded zero when negative.
  for (uint64_t idx = 0; idx < size; ++idx) {
    if (convert_bf16_fp32(buf[idx]) < 0) {
      buf[idx] = convert_fp32_bf16(0);
    }
  }
}
/**
 * Flattens a (row, column) pair into a row-major offset.
 *
 * @param h  row index
 * @param w1 row width (elements per row)
 * @param w2 column index
 * @return h * w1 + w2
 */
static int index_get(int h, int w1, int w2) {
  const int row_offset = h * w1;
  return row_offset + w2;
}
// CPU reference for the bf16 depthwise (weighted average pooling) operation.
//
// For every batch entry and channel it pads the input plane, optionally
// dilates the per-channel kernel, and for each output position multiplies the
// window by the kernel (or by one constant weight), sums the products with
// inner_float_product(), adds the channel bias when one is given, and stores
// the bf16 result in o_fmap.
//
// Parameters:
//   i_fmap        input planes, input_h * input_w bf16 values per channel
//   weight        bf16 kernel data; kh * kw values per channel, or a single
//                 value when const_weight is non-zero
//   bias          optional per-channel bias decoded with convert_hex_fp32(); may be NULL
//   o_fmap        output planes, output_h * output_w bf16 values per channel
//   input_n/c/h/w input dimensions
//   kh, kw        kernel size
//   pad_*         padding on each side
//   stride_h/w    window step
//   ins_*         insertion counts forwarded to calc_dilute_hw()/fill_pad_fmap_bf16()
//   dh, dw        kernel dilation
//   const_weight  non-zero to use *weight for every tap
// Returns BM_ERR_INVALID_ARGUMENT when kh * kw <= 0, otherwise BM_SUCCESS.
int native_pooling_avg_bf16(const uint16_t *i_fmap, const void *weight, const uint32_t *bias,
uint16_t *o_fmap, int input_n, int input_c, int input_h, int input_w,
int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r,
int stride_h, int stride_w, int ins_h, int ins_w, int ins_h_last,
int ins_w_last, int dh, int dw, int const_weight) {
if (kh * kw <= 0) return BM_ERR_INVALID_ARGUMENT;
// First weight value; used for every tap when const_weight is set.
uint16_t avg_const_weight = *(uint16_t *)weight;
uint16_t *weight_arr = (uint16_t *)weight;
// Padded/inserted input size and dilated kernel size.
int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b);
int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r);
int d_kh = calc_dilute_hw(kh, dh - 1, 0, 0, 0);
int d_kw = calc_dilute_hw(kw, dw - 1, 0, 0, 0);
int output_h = calc_output_hw(h_after, d_kh, stride_h);
int output_w = calc_output_hw(w_after, d_kw, stride_w);
// printf("output_h/output_w is %d/%d\n", output_h, output_w);
// Scratch holding one window (a) and the matching kernel taps (b).
// NOTE(review): the malloc results are not checked.
float *avg_pooling_mac_a = (float *)malloc(d_kh * d_kw * sizeof(float));
float *avg_pooling_mac_b = (float *)malloc(d_kh * d_kw * sizeof(float));
// Filled in by fill_pad_fmap_bf16() and freed once at the end.
// NOTE(review): presumably the helper allocates on first use and reuses the
// buffer afterwards - confirm against its implementation.
uint16_t *i_fmap_pad = NULL;
uint16_t *i_kmap_pad = NULL;
for (int n = 0; n < input_n; n++) {
// Per-channel kernels restart from the first channel for every batch entry.
if (const_weight == 0) weight_arr = (uint16_t *)weight;
for (int c = 0; c < input_c; ++c) {
fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, 0, pad_w_l, pad_w_r, pad_h_t, pad_h_b, ins_h, ins_w,
ins_h_last, ins_w_last, input_h, input_w);
// kernel_dilation(
if (const_weight == 0)
fill_pad_fmap_bf16((weight_arr), &i_kmap_pad, 0, 0, 0, 0,
0, // no padding
dh - 1, dw - 1, 0, 0, kh, kw);
float avg_pool_result;
for (int ph = 0; ph < output_h; ++ph) {
for (int pw = 0; pw < output_w; ++pw) {
// Top-left corner of the window in the padded input.
int hstart = ph * stride_h;
int wstart = pw * stride_w;
int pool_index = index_get(ph, output_w, pw);
int mac_index = 0;
// Running sum that is only reported by the disabled debug print below;
// the stored result comes from inner_float_product().
float r = 0;
for (int h = 0; h < d_kh; h++) {
for (int w = 0; w < d_kw; w++) {
int index = index_get((hstart + h), w_after, (w + wstart));
mac_index = h * d_kw + w;
avg_pooling_mac_a[mac_index] = convert_bf16_fp32(i_fmap_pad[index]);
avg_pooling_mac_b[h * d_kw + w] = const_weight
? convert_bf16_fp32(avg_const_weight)
: convert_bf16_fp32(i_kmap_pad[mac_index]);
#if 0
printf ("ref[ni %u][ci %u][oh/ow %u/%u][kh/kw %u/%u] o[%d]"
" %.1f * %.1f + %.1f = %.1f\n",
n, c, ph, pw, h, w, pool_index,
avg_pooling_mac_a[mac_index], avg_pooling_mac_b[h*d_kw+w],
r, r + avg_pooling_mac_a[mac_index] * avg_pooling_mac_b[h*d_kw+w]);
#endif
r += avg_pooling_mac_a[mac_index] * avg_pooling_mac_b[h * d_kw + w];
}
}
inner_float_product(avg_pooling_mac_a, avg_pooling_mac_b, d_kh * d_kw, &avg_pool_result);
if (bias) {
avg_pool_result += convert_hex_fp32(bias[c]);
}
*(o_fmap + pool_index) = convert_fp32_bf16(avg_pool_result);
}
}
// Step all three cursors to the next channel.
weight_arr += kh * kw;
i_fmap += input_w * input_h;
o_fmap += output_w * output_h;
}
}
free(i_fmap_pad);
free(i_kmap_pad);
free(avg_pooling_mac_a);
free(avg_pooling_mac_b);
return BM_SUCCESS;
}
/**
 * Returns the element size in bytes for a supported format.
 *
 * @param fmt CVK_FMT_BF16, CVK_FMT_I8 or CVK_FMT_U8 (asserted)
 * @return 2 for bf16, 1 for the 8-bit formats
 */
static int get_fsz(cvk_fmt_t fmt) {
  assert(fmt == CVK_FMT_BF16 || fmt == CVK_FMT_I8 || fmt == CVK_FMT_U8);
  if (fmt == CVK_FMT_BF16) {
    return 2;
  }
  return 1;
}
/**
 * Compares the TPU output against the CPU reference and aborts the process on
 * a mismatch.
 *
 * When p->relu_enable is set the reference is clamped first. With
 * is_valid_pack == 0 the hardware output still has the reshaped layout, so
 * each original channel is compared separately, skipping the padding between
 * channels; otherwise both buffers are compared in one pass.
 *
 * Ownership: output_ref is released here with delete[]; the caller must not
 * use or free it afterwards.
 *
 * @param p                 parameter set describing the hardware run
 * @param input             input data (only checked for non-NULL)
 * @param weight            weight data (only checked for non-NULL)
 * @param bias              bias data, may be NULL (only its address is printed)
 * @param output            data read back from the device
 * @param output_ref        CPU reference; consumed by this function
 * @param org_o_shape_size  element count of the un-reshaped output
 * @param is_valid_pack     non-zero when output is already densely packed
 * @param org_oc            original output channel count
 * @param org_oh            original output height
 * @param org_ow            original output width
 */
static void compare_results(param_t *p, uint16_t input[], uint16_t weight[], uint32_t bias[],
                            uint16_t output[], uint16_t output_ref[], uint32_t org_o_shape_size,
                            int is_valid_pack, int org_oc, int org_oh, int org_ow) {
  assert(input);
  assert(weight);
  (void)input;
  (void)weight;
  // "%p" requires a void pointer argument.
  printf("bias at %p\n", (void *)bias);
  int f_sz = get_fsz(p->ofmap->fmt);
  if (p->relu_enable) {
    cvm_relu(output_ref, org_o_shape_size, p->ofmap->fmt);
  }
  int cmp_res = -1;
  if (!is_valid_pack) {
    // we reshape c with SAME mode padding with garbage
    // \is_valid_pack set to false means we skip garbage part
    int org_hw = org_oh * org_ow;
    int new_hw = p->ofmap->shape.h * p->ofmap->shape.w;
    // Number of reshaped channels that make up one original channel.
    int duplicated_c = p->ofmap->shape.c / org_oc;
    assert(new_hw >= org_hw / duplicated_c);
    int8_t *output_c = ((int8_t *)output);
    int8_t *output_ref_c = ((int8_t *)output_ref);
    for (int c = 0; c < org_oc; c++) {
      // Byte offsets: the device buffer advances by the reshaped channel
      // group, the reference by the original plane size.
      cmp_res =
          array_cmp_int8("Comparing results ...\n", output_c + c * duplicated_c * new_hw * f_sz,
                         output_ref_c + org_hw * c * f_sz, org_hw * f_sz);
      if (cmp_res != 0) {
        break;
      }
    }
  } else {
    cmp_res = array_cmp_int8("Comparing results ...\n", (int8_t *)output_ref, (int8_t *)output,
                             org_o_shape_size * f_sz);
  }
  if (cmp_res != 0) {
    printf("Comparison FAILED!!!\n");
    // print_pooling_param(p);
    exit(-1);
  }
  delete[] output_ref;
}
/**
 * Height of the input after zero insertion and vertical padding.
 *
 * @param ins_h       zeros inserted after every row except the last
 * @param ins_last_h  zeros inserted after the last row
 * @param pad_top     rows of padding above
 * @param pad_bottom  rows of padding below
 * @param ih          original input height
 * @return (ih - 1) * (ins_h + 1) + ins_last_h + 1 + pad_top + pad_bottom
 */
static int pooling_ih_ext(int ins_h, int ins_last_h, int pad_top, int pad_bottom, int ih) {
  const int vertical_pad = pad_top + pad_bottom;
  const int step = ins_h + 1;
  return (ih - 1) * step + ins_last_h + 1 + vertical_pad;
}
/**
 * Width of the input after zero insertion and horizontal padding.
 *
 * @param ins_w       zeros inserted after every column except the last
 * @param ins_last_w  zeros inserted after the last column
 * @param pad_left    columns of padding on the left
 * @param pad_right   columns of padding on the right
 * @param iw          original input width
 * @return (iw - 1) * (ins_w + 1) + ins_last_w + 1 + pad_left + pad_right
 */
static int pooling_iw_ext(int ins_w, int ins_last_w, int pad_left, int pad_right, int iw) {
  const int horizontal_pad = pad_left + pad_right;
  const int step = ins_w + 1;
  return (iw - 1) * step + ins_last_w + 1 + horizontal_pad;
}
/**
 * Output height of the pooling/depthwise window sweep.
 *
 * @param ins_h, ins_last_h, pad_top, pad_bottom, ih  forwarded to pooling_ih_ext()
 * @param stride_h  vertical step of the window
 * @param kh        kernel height
 * @param dh        vertical dilation
 * @return (extended_height - dilated_kernel_height) / stride_h + 1
 */
static int pooling_oh(int ins_h, int ins_last_h, int pad_top, int pad_bottom, int stride_h, int ih,
                      int kh, int dh) {
  const int dilated_kh = (kh - 1) * dh + 1;
  const int extended_h = pooling_ih_ext(ins_h, ins_last_h, pad_top, pad_bottom, ih);
  const int last_start = extended_h - dilated_kh;
  return last_start / stride_h + 1;
}
/**
 * Output width of the pooling/depthwise window sweep.
 *
 * @param ins_w, ins_last_w, pad_left, pad_right, iw  forwarded to pooling_iw_ext()
 * @param stride_w  horizontal step of the window
 * @param kw        kernel width
 * @param dw        horizontal dilation
 * @return (extended_width - dilated_kernel_width) / stride_w + 1
 */
static int pooling_ow(int ins_w, int ins_last_w, int pad_left, int pad_right, int stride_w, int iw,
                      int kw, int dw) {
  const int dilated_kw = (kw - 1) * dw + 1;
  const int extended_w = pooling_iw_ext(ins_w, ins_last_w, pad_left, pad_right, iw);
  const int last_start = extended_w - dilated_kw;
  return last_start / stride_w + 1;
}
/**
 * Releases the heap-allocated tensor descriptors attached to a parameter set
 * and clears the four pointers.
 *
 * Use this only for descriptors obtained with malloc(); it calls free() on
 * them directly.
 *
 * @param p parameter set to clean up
 */
static void free_depthwise_struct(param_t *p) {
  if (p->bias) {
    free((void *)p->bias);
  }
  p->bias = NULL;

  free((void *)p->weight);
  p->weight = NULL;

  free((void *)p->ifmap);
  p->ifmap = NULL;

  free((void *)p->ofmap);
  p->ofmap = NULL;
}
/**
 * Returns the tensors of a parameter set to the kernel context via free_tl().
 *
 * Tensors are released in the fixed order ofmap, weight, bias, ifmap; NULL
 * entries are skipped. The pointers themselves are left unchanged.
 *
 * @param ctx kernel context that owns the tensors
 * @param p   parameter set whose tensors are released
 */
static void free_depthwise_param(cvk_context_t *ctx, param_t *p) {
  if (p->ofmap) {
    free_tl(ctx, p->ofmap);
  }
  if (p->weight) {
    free_tl(ctx, p->weight);
  }
  if (p->bias) {
    free_tl(ctx, p->bias);
  }
  if (p->ifmap) {
    free_tl(ctx, p->ifmap);
  }
}
// Builds a host-only parameter set describing one depthwise test case.
//
// Despite the name, almost every randomly drawn value is overwritten with a
// fixed one further down: n = 1, c = 3, 5x5 kernel, stride_w = 1, left/right
// padding 2, no top/bottom padding, no insertion, dilation 1, rshift 0. Only
// the bias on/off choice and relu_enable stay random; ih, iw, stride_h and
// the input format come from the arguments.
//
// The tensor descriptors (ifmap, weight, ofmap and optionally bias) are
// malloc()ed placeholders whose start_address is -1; they carry shape, format
// and stride only. Release them with free_depthwise_struct().
//
// Parameters:
//   ctx        kernel context, used only to compute default strides
//   _ih, _iw   input height and width
//   _stride_h  vertical stride
//   _fmt       input format; weight, bias and ofmap use BF16 when this is
//              CVK_FMT_BF16 and CVK_FMT_I8 otherwise
// Returns the parameter set by value.
// NOTE(review): `p` is never zero-initialised, so any field not assigned
// below is returned with an indeterminate value.
static param_t random_depthwise_param(cvk_context_t *ctx, int _ih, int _iw, int _stride_h,
cvk_fmt_t _fmt) {
param_t p;
// retry:
// Re-seed on every call; the seed is kept in the global random_seed.
random_seed = clock();
srand(random_seed);
int using_bias = rand() % 2;
// The draws below still consume rand() values even where the result is
// replaced by a fixed value afterwards.
int n = rand() % 5 + 1;
n = 1;
int c = rand() % (3 * NPU_NUM) + 1;
c = 3;
int ih = rand() % 30 + 3;
int iw = rand() % 30 + 6;
int kh = rand() % 7 + 1;
int kw = rand() % 7 + 1;
p.ins_h = rand() % kh;
p.ins_w = rand() % kw;
p.ins_last_h = rand() % kh;
p.ins_last_w = rand() % kw;
p.stride_h = rand() % kh + 1;
p.stride_w = rand() % kw + 1;
p.pad_top = rand() % kh;
p.pad_bottom = rand() % kh;
p.pad_left = rand() % kw;
p.pad_right = rand() % kw;
p.rshift_bits = rand() % 32;
p.dilation_h = rand() % 4 + 1;
p.dilation_w = rand() % 4 + 1;
// default
cvk_fmt_t ifmt = CVK_FMT_BF16;
cvk_fmt_t other_fmt = CVK_FMT_BF16;
// Fixed configuration that replaces the random draws above.
ih = 24;
iw = 16;
kw = 5;
kh = 5;
p.stride_h = 1;
p.stride_w = 1;
p.rshift_bits = 0;
// Caller-supplied values take precedence over the defaults.
ih = _ih;
p.stride_h = _stride_h;
iw = _iw;
ifmt = _fmt;
// Weight/bias/ofmap format: I8 unless the input is BF16.
other_fmt = CVK_FMT_I8;
if (ifmt != CVK_FMT_BF16) {
} else {
other_fmt = CVK_FMT_BF16;
}
p.pad_left = 2;
p.pad_right = 2;
p.pad_top = 0;
p.pad_bottom = 0;
// TODO: pad / ins / dilation
p.ins_h = 0;
p.ins_last_h = 0;
p.ins_w = 0;
p.ins_last_w = 0;
p.dilation_h = 1;
p.dilation_w = 1;
int oh =
pooling_oh(p.ins_h, p.ins_last_h, p.pad_top, p.pad_bottom, p.stride_h, ih, kh, p.dilation_h);
int ow =
pooling_ow(p.ins_w, p.ins_last_w, p.pad_left, p.pad_right, p.stride_w, iw, kw, p.dilation_w);
cvk_tl_shape_t ofmap_shape;
ofmap_shape.n = n;
ofmap_shape.c = c;
ofmap_shape.h = oh;
ofmap_shape.w = ow;
cvk_tl_shape_t ifmap_shape;
ifmap_shape.n = n;
ifmap_shape.c = c;
ifmap_shape.h = ih;
ifmap_shape.w = iw;
cvk_tl_shape_t weight_shape;
weight_shape.n = 1;
weight_shape.c = c;
weight_shape.h = kh;
weight_shape.w = kw;
// Bias uses n = 2; put_bias_tensor() in this file writes two planes of c
// values each, which matches this shape.
cvk_tl_shape_t bias_shape;
bias_shape.n = 2;
bias_shape.c = c;
bias_shape.h = 1;
bias_shape.w = 1;
p.relu_enable = rand() % 2;
// fake init for ref
// `bias` stays uninitialised when using_bias is 0; it is only touched
// inside the using_bias branches.
// NOTE(review): the malloc results below are not checked.
cvk_tl_t *bias, *weight, *ofmap, *ifmap;
ifmap = (cvk_tl_t *)malloc(sizeof(cvk_tl_t));
if (using_bias) {
bias = (cvk_tl_t *)malloc(sizeof(cvk_tl_t));
}
weight = (cvk_tl_t *)malloc(sizeof(cvk_tl_t));
ofmap = (cvk_tl_t *)malloc(sizeof(cvk_tl_t));
p.bias = NULL;
if (using_bias) {
bias->start_address = -1;
bias->fmt = other_fmt;
bias->shape = bias_shape;
bias->stride = ctx->ops->tl_default_stride(ctx, bias->shape, other_fmt, /*eu_align*/ 0);
p.bias = bias;
}
weight->start_address = -1;
weight->fmt = other_fmt;
weight->shape = weight_shape;
weight->stride = ctx->ops->tl_default_stride(ctx, weight->shape, other_fmt, /*align*/ 1);
p.weight = weight;
ofmap->start_address = -1;
ofmap->fmt = other_fmt;
ofmap->shape = ofmap_shape;
ofmap->stride = ctx->ops->tl_default_stride(ctx, ofmap->shape, other_fmt, /*align*/ 1);
p.ofmap = ofmap;
ifmap->start_address = -1;
ifmap->fmt = ifmt;
ifmap->shape = ifmap_shape;
ifmap->stride = ctx->ops->tl_default_stride(ctx, ifmap->shape, ifmt, /*align*/ 1);
p.ifmap = ifmap;
// Disabled validity check. NOTE(review): it would not compile if enabled,
// because pooling_ih_ext()/pooling_iw_ext() take five ints in this file.
#if 0
int d_kh = calc_dilute_hw(kh, p.dilation_h - 1, 0, 0, 0);
int d_kw = calc_dilute_hw(kw, p.dilation_w - 1, 0, 0, 0);
if ((kh > pooling_ih_ext(&p, ih))
|| (kw > pooling_iw_ext(&p, iw))
|| (oh < d_kh)
|| (ow < d_kw)
|| (p.pad_top >= (1 << 4))
|| (p.pad_bottom >= (1 << 4))
|| (p.pad_left >= (1 << 4))
|| (p.pad_right >= (1 << 4))
|| !p.ofmap
|| !p.ifmap
|| !p.weight
|| (using_bias && !p.bias)
) {
LOG(INFO) << "retry init_pooling_param";
assert(0 && "it MUST valid param pass");
goto retry;
}
#endif
return p;
}
/**
 * Splits each bias value into two planes and uploads them into the local
 * tensor `tl`.
 *
 *  - CVK_FMT_BF16: data[] holds 32-bit words; plane 0 receives the upper
 *    16 bits and plane 1 the lower 16 bits of every word.
 *  - otherwise: data[] is read as 16-bit words; plane 0 receives the low byte
 *    and plane 1 the high byte of every word.
 *
 * @param ctx    runtime handle
 * @param bk_ctx kernel context
 * @param tl     destination tensor; tl->shape.c gives the channel count
 * @param data   bias values, one per channel
 */
static void put_bias_tensor(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx, const cvk_tl_t *tl,
                            uint32_t data[]) {
  const int channels = tl->shape.c;
  // Sized for the bf16 case (2 planes x channels x 16 bit); the 8-bit case
  // only uses the first 2 * channels bytes.
  uint16_t *planes = (uint16_t *)malloc(sizeof(uint16_t) * 2 * channels);

  if (tl->fmt != CVK_FMT_BF16) {
    uint8_t *byte_planes = (uint8_t *)planes;
    uint16_t *words = (uint16_t *)data;
    for (int ch = 0; ch < channels; ++ch) {
      byte_planes[ch] = words[ch] & 0xff;
      byte_planes[ch + channels] = (words[ch] >> 8) & 0xff;
    }
  } else {
    for (int ch = 0; ch < channels; ++ch) {
      planes[ch] = (data[ch] >> 16) & 0xffff;
      planes[ch + channels] = (data[ch] & 0xffff);
    }
  }

  put_bf16_tensor_g2l(ctx, bk_ctx, tl, (uint16_t *)planes, tl->fmt);
  free(planes);
}
/**
 * \brief Computes the shapes and strides needed to reload a reshaped output
 *        so that only the original (org_oc, org_oh, org_ow) data is kept.
 *
 * Both shapes are set to (1, org_oc, org_oh, org_ow). The local stride is the
 * context's default stride for that shape without EU alignment. The global
 * n and c strides step over one whole group of reshaped channels, and the
 * global h stride is one original row.
 *
 * \param bk_ctx          kernel context used for the default stride
 * \param ofmap           reshaped output tensor produced by the hardware
 * \param org_oc          original output channel count
 * \param org_oh          original output height
 * \param org_ow          original output width
 * \param tl_shape        out: local tensor shape
 * \param tl_load_stride  out: local tensor stride
 * \param tg_shape        out: global tensor shape
 * \param tg_stride       out: global tensor stride (n, c and h are written)
 * \param fmt             data format; BF16, I8 or U8 (asserted)
 * \return always 0
 */
static int reshape_valid_output(cvk_context_t *bk_ctx, const cvk_tl_t *ofmap, int org_oc,
                                int org_oh, int org_ow, cvk_tl_shape_t *tl_shape,
                                cvk_tl_stride_t *tl_load_stride, cvk_tg_shape_t *tg_shape,
                                cvk_tg_stride_t *tg_stride, cvk_fmt_t fmt) {
  assert(fmt == CVK_FMT_BF16 || fmt == CVK_FMT_I8 || fmt == CVK_FMT_U8);

  // Local shape first, then mirror it into the global shape.
  tl_shape->n = 1;
  tg_shape->n = tl_shape->n;
  tl_shape->c = org_oc;
  tg_shape->c = tl_shape->c;
  tl_shape->h = org_oh;
  tg_shape->h = tl_shape->h;
  tl_shape->w = org_ow;
  tg_shape->w = tl_shape->w;

  cvk_tl_stride_t dense = bk_ctx->ops->tl_default_stride(bk_ctx, *tl_shape, fmt, /*eu_align*/ 0);
  tl_load_stride->n = dense.n;
  tl_load_stride->c = dense.c;
  tl_load_stride->h = dense.h;
  tl_load_stride->w = dense.w;

  // How many reshaped channels correspond to one original channel.
  int channels_per_group = ofmap->shape.c / org_oc;
  tg_stride->c = channels_per_group * ofmap->shape.h * ofmap->shape.w * get_fsz(fmt);
  tg_stride->n = tg_stride->c;
  tg_stride->h = org_ow * get_fsz(fmt);
  return 0;
}
/**
 * Computes the CPU reference output for one depthwise case.
 *
 * Batch size is fixed to 1, with no insertion, no top/bottom padding,
 * dilation 1 and no right shift. BF16 inputs go through
 * native_pooling_avg_bf16(); every other format goes through
 * native_pooling_ave_int8() with a signed result.
 *
 * @param ic, ih, iw            input channels, height, width
 * @param kh, kw                kernel size
 * @param pad_right, pad_left   horizontal padding
 * @param stride_h, stride_w    window step
 * @param fmt                   input format
 * @param input                 input data
 * @param weight                per-channel kernel data
 * @param bias                  per-channel bias, may be NULL
 * @param output_ref            out: reference result
 * @return status of the reference implementation that was called
 */
static bmerr_t init_ref(int ic, int ih, int iw, int kh, int kw, int pad_right, int pad_left,
                        int stride_h, int stride_w, cvk_fmt_t fmt, uint16_t *input,
                        uint16_t *weight, uint32_t *bias, uint16_t *output_ref) {
  // Settings that are fixed for this test.
  const int batch = 1;
  const int ins_h = 0, ins_w = 0;
  const int ins_last_h = 0, ins_last_w = 0;
  const int dilation_h = 1, dilation_w = 1;
  const int pad_top = 0, pad_bottom = 0;
  const int rshift_bits = 0;

  if (fmt == CVK_FMT_BF16) {
    return native_pooling_avg_bf16(input, weight, bias, output_ref, batch, ic, ih, iw, kh, kw,
                                   pad_top, pad_bottom, pad_left, pad_right, stride_h, stride_w,
                                   ins_h, ins_w, ins_last_h, ins_last_w, dilation_h, dilation_w,
                                   0);
  }

  const int opd0_sign = fmt == CVK_FMT_I8;
  const int res0_sign = true;  // the output is always treated as signed
  return native_pooling_ave_int8((int8_t *)input, (int8_t *)weight, (int16_t *)bias,
                                 (int8_t *)output_ref, batch, ic, ih, iw, kh, kw, pad_top,
                                 pad_bottom, pad_left, pad_right, stride_h, stride_w, ins_h, ins_w,
                                 ins_last_h, ins_last_w, opd0_sign, res0_sign, rshift_bits, 0);
}
/**
 * Runs one depthwise-convolution case on the TPU with the channel-reshape
 * ("same") optimisation and checks the result against the CPU reference.
 *
 * Steps: build host data and the reference output, ask
 * cvm_reshape_channel_same() for the reshaped layouts, upload input, bias and
 * weight in that layout, run the depthwise convolution, read the output back
 * and compare it. compare_results() terminates the process on a mismatch.
 *
 * @param ctx                   runtime handle
 * @param bk_ctx                kernel context
 * @param ic, ih, iw            input channels, height, width
 * @param kh, kw                kernel size
 * @param pad_right, pad_left   horizontal padding
 * @param stride_h, stride_w    window step
 * @param has_bias              whether a bias tensor is generated and applied
 * @param ifmt                  input format: BF16, I8 or U8 (asserted)
 * @return 1 when the case ran and matched, -1 when the shape could not be
 *         reshaped.
 */
static int test_depthwise(CVI_RT_HANDLE ctx, cvk_context_t *bk_ctx, int ic, int ih, int iw, int kh,
                          int kw, int pad_right, int pad_left, int stride_h, int stride_w,
                          bool has_bias, cvk_fmt_t ifmt) {
  // Zero-initialise the parameter set: the early-return path below inspects
  // the tensor pointers, and the kernel reads fields this test never sets.
  param_t param = {};
  param_t *p = &param;
  assert(ifmt == CVK_FMT_BF16 || ifmt == CVK_FMT_I8 || ifmt == CVK_FMT_U8);
  int in = 1;
  // TODO: verify dilation > 1
  int dilation_h = 1;
  int dilation_w = 1;
  int relu_enable = 0;
  int rshift_bits = 0;
  // TODO: verify ins_x
  int org_oh = pooling_oh(0, 0, 0, 0, stride_h, ih, kh, dilation_h);
  int org_ow = pooling_ow(0, 0, pad_left, pad_right, stride_w, iw, kw, dilation_w);
  int org_oc = ic;
  int org_o_shape_size = in * org_oc * org_oh * org_ow;
  uint16_t *output;
  // Zero-initialised so that only src/dst differ from the defaults.
  cvk_tdma_g2l_tensor_copy_param_t p1 = {};
  cvk_tdma_l2g_tensor_copy_param_t p2 = {};
  // weight / ofmap not support U8 format
  cvk_fmt_t other_fmt = ifmt == CVK_FMT_BF16 ? CVK_FMT_BF16 : CVK_FMT_I8;

  // alloc testbench, input/ref
  uint16_t *input = alloc_input(ic, ih, iw, ifmt);
  uint16_t *weight = alloc_weight(ic, kh, kw, ifmt);
  uint32_t *bias = NULL;
  if (has_bias) bias = alloc_bias(ic, ifmt);
  uint16_t *output_ref = alloc_output(ic, org_oh, org_ow);

  // init ref
  init_ref(ic, ih, iw, kh, kw, pad_right, pad_left, stride_h, stride_w, ifmt, input, weight, bias,
           output_ref);

  // init param
  // TODO: verify pad_top/pad_bottom
  // TODO: verify ins_h_x
  p->pad_left = pad_left;
  p->pad_right = pad_right;
  p->pad_top = 0;
  p->pad_bottom = 0;
  p->ins_h = 0;
  p->ins_last_h = 0;
  p->ins_w = 0;
  p->ins_last_w = 0;
  p->dilation_h = dilation_h;
  p->dilation_w = dilation_w;
  p->stride_h = stride_h;
  p->stride_w = stride_w;
  p->relu_enable = relu_enable;
  p->rshift_bits = rshift_bits;
  p->bias = NULL;

  // prepare load / input / weight / bias / output new shape / stride
  cvk_tl_shape_t tl_load_shape;
  cvk_tl_stride_t tl_load_stride;
  cvk_tg_shape_t tg_shape;
  cvk_tg_stride_t tg_stride;
  cvk_tl_shape_t tl_weight_shape;
  cvk_tl_shape_t tl_bias_shape;
  cvk_tl_shape_t tl_output_shape;
  cvk_tl_t *tmp_tl_load;
  cvk_tg_t *tmp_tg;

  // get reshaped information
  int r = cvm_reshape_channel_same(bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, stride_h,
                                   stride_w, &tl_load_shape, &tl_load_stride, &tg_shape, &tg_stride,
                                   &tl_weight_shape, &tl_bias_shape, &tl_output_shape, ifmt,
                                   /*align*/ 1);
  if (r == -1) {
    printf("could not reshape it, 81\n");
    // No local tensor has been allocated yet; every pointer in *p is NULL,
    // so this call releases nothing.
    free_depthwise_param(bk_ctx, p);
    delete[] input;
    delete[] output_ref;  // compare_results() never runs on this path
    free(weight);
    free(bias);
    return -1;
  }

  // prepare input tg
  {
    cvk_tg_shape_t put_tg_shape;
    put_tg_shape.n = in;
    put_tg_shape.c = ic;
    put_tg_shape.h = ih;
    put_tg_shape.w = iw;
    cvk_tg_t *put_tg = alloc_tg_bf16_gmem(&ctx, bk_ctx, put_tg_shape, ifmt);
    put_tg_bf16_gmem(&ctx, put_tg, (uint8_t *)input);
    // NOTE(review): the buffer is freed right after the upload and the next
    // block allocates a new one to read from - presumably the allocator hands
    // back the same device memory; confirm.
    free_tg_gmem(&ctx, put_tg);
  }

  // prepare load input, put to tg and load back
  {
    tmp_tl_load = alloc_tl_bf16(bk_ctx, tl_load_shape, ifmt, /*eu_align*/ 0);
    assert(tmp_tl_load);
    tmp_tg = alloc_tg_bf16_gmem(&ctx, bk_ctx, tg_shape, ifmt);
    tmp_tg->stride = tg_stride;
    p1.src = tmp_tg;
    p1.dst = tmp_tl_load;
    bk_ctx->ops->tdma_g2l_bf16_tensor_copy(bk_ctx, &p1);
    test_submit_comp(&ctx, bk_ctx);
    free_tg_gmem(&ctx, tmp_tg);
    // fit for hw
    tmp_tl_load->stride =
        bk_ctx->ops->tl_default_stride(bk_ctx, tmp_tl_load->shape, ifmt, /*align*/ 1);
    p->ifmap = tmp_tl_load;
  }

  // prepare load bias, put to tg and load back
  if (has_bias) {
    // bias must i8
    cvk_fmt_t bias_fmt = ifmt == CVK_FMT_BF16 ? CVK_FMT_BF16 : CVK_FMT_I8;
    p->bias = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, tl_bias_shape, bias_fmt, 0);
    // duplicate bias and replace old
    uint32_t *new_bias = cvm_reshape_channel_bias((uint8_t *)bias, tl_bias_shape.n, tl_bias_shape.c,
                                                  tl_bias_shape.h, tl_bias_shape.w, org_oc, ifmt);
    // free old one
    free(bias);
    bias = new_bias;
    put_bias_tensor(&ctx, bk_ctx, p->bias, bias);
  }

  // prepare load weight, put to tg and load back
  {
    p->weight = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, tl_weight_shape, other_fmt, /*align*/ 1);
    assert(p->weight);
    // duplicate kernel with c
    uint8_t *new_weight =
        cvm_reshape_channel_weight((uint8_t *)weight, tl_weight_shape.n, tl_weight_shape.c,
                                   tl_weight_shape.h, tl_weight_shape.w, org_oc, ifmt);
    // free old one
    free(weight);
    weight = (uint16_t *)new_weight;
    put_bf16_tensor_g2l(&ctx, bk_ctx, p->weight, (uint16_t *)weight, ifmt);
  }

  // prepare ofmap
  {
    // we allocate 'same' mode shape
    p->ofmap = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, tl_output_shape, other_fmt, /*align*/ 1);
    assert(p->ofmap);
  }

  // emit: the same entry point serves both the bf16 and the 8-bit path
  bk_ctx->ops->tiu_pt_depthwise_convolution(bk_ctx, p);

  // check with no pad if true
  int is_valid_pack = false;
  cvk_tl_shape_t r_ofmap_shape;
  cvk_tl_stride_t r_ofmap_stride;
  cvk_tg_shape_t r_tg_shape;
  cvk_tg_stride_t r_tg_stride;
  reshape_valid_output(bk_ctx, p->ofmap, org_oc, org_oh, org_ow, &r_ofmap_shape, &r_ofmap_stride,
                       &r_tg_shape, &r_tg_stride, ifmt);
  p1.dst = p->ofmap;
  if (is_valid_pack) {
    cvk_tg_shape_t dst_shape;
    dst_shape.n = p->ofmap->shape.n;
    dst_shape.c = p->ofmap->shape.c;
    dst_shape.h = p->ofmap->shape.h;
    dst_shape.w = p->ofmap->shape.w;
    cvk_tg_t *cvk_tg_tmp = alloc_tg_bf16_gmem(&ctx, bk_ctx, dst_shape, ifmt);
    p2.src = p->ofmap;
    p2.dst = cvk_tg_tmp;
    // store for later reshape
    bk_ctx->ops->tdma_l2g_bf16_tensor_copy(bk_ctx, &p2);
    test_submit_comp(&ctx, bk_ctx);
    // free useless for later reallocate
    free_depthwise_param(bk_ctx, p);
    p->ofmap = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, r_ofmap_shape, ifmt,
                                              /*eu_align*/ 0);
    assert(p->ofmap);
    cvk_tg_tmp->shape = r_tg_shape;
    cvk_tg_tmp->stride = r_tg_stride;
    p1.src = cvk_tg_tmp;
    p1.dst = p->ofmap;
    bk_ctx->ops->tdma_g2l_bf16_tensor_copy(bk_ctx, &p1);
    free_tg_gmem(&ctx, cvk_tg_tmp);
  }

  cvk_fmt_t ofmap_fmt = ifmt == CVK_FMT_BF16 ? CVK_FMT_BF16 : CVK_FMT_I8;
  output = (uint16_t *)get_bf16_tensor_l2g(&ctx, bk_ctx, p1.dst, ofmap_fmt);
  // compare_results() takes ownership of output_ref and deletes it.
  compare_results(p, input, weight, bias, output, output_ref, org_o_shape_size, is_valid_pack,
                  org_oc, org_oh, org_ow);

  // free resource
  if (is_valid_pack) {
    free_tl(bk_ctx, p->ofmap);
  } else {
    free_depthwise_param(bk_ctx, p);
  }
  delete[] input;
  free(weight);
  free(bias);
  free(output);
  return 1;
}
/**
 * Copies the scalar test dimensions out of a parameter set.
 *
 * @param p          parameter set; ifmap and weight must be non-NULL
 * @param ic         out: input channel count
 * @param ih         out: input height
 * @param iw         out: input width
 * @param kh         out: kernel height
 * @param kw         out: kernel width
 * @param pad_right  out: right padding
 * @param pad_left   out: left padding
 */
static void init_input(param_t *p, int *ic, int *ih, int *iw, int *kh, int *kw, int *pad_right,
                       int *pad_left) {
  // Padding
  *pad_left = p->pad_left;
  *pad_right = p->pad_right;
  // Kernel size
  *kw = p->weight->shape.w;
  *kh = p->weight->shape.h;
  // Input feature map
  *iw = p->ifmap->shape.w;
  *ih = p->ifmap->shape.h;
  *ic = p->ifmap->shape.c;
}
static int test_depthwise_pooling(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx) {
int loop = 1;
int test_finished_num = 0;
int ihs[] = {24, 96, 120, 480, 0};
int iws[] = {16, 17, 19, 23, 128, 256, 0};
int stride_hs[] = {3, 4, 0};
cvk_fmt_t formats[] = {CVK_FMT_I8, CVK_FMT_U8, CVK_FMT_BF16, CVK_FMT_F32};
int ic, ih, iw, kh, kw, pad_right, pad_left;
cvk_fmt_t ifmt;
param_t param;
assert(print_pooling_param);
ifmt = CVK_FMT_U8;
param = random_depthwise_param(bk_ctx, 210, 640, 1, ifmt);
init_input(&param, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
param.stride_h, param.stride_w, param.bias, ifmt);
print_pooling_param(&param);
free_depthwise_struct(&param);
#if 1
param = random_depthwise_param(bk_ctx, 36, 11, 3, ifmt);
init_input(&param, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
param.stride_h, param.stride_w, param.bias, ifmt);
print_pooling_param(&param);
free_depthwise_struct(&param);
ifmt = CVK_FMT_U8;
param = random_depthwise_param(bk_ctx, 24, 29, 3, ifmt);
init_input(&param, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
free_depthwise_struct(&param);
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
param.stride_h, param.stride_w, param.bias, ifmt);
ifmt = CVK_FMT_BF16;
param = random_depthwise_param(bk_ctx, 480, 53, 3, ifmt);
init_input(&param, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
free_depthwise_struct(&param);
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
param.stride_h, param.stride_w, param.bias, ifmt);
ifmt = CVK_FMT_I8;
param = random_depthwise_param(bk_ctx, 480, 61, 3, ifmt);
init_input(&param, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
free_depthwise_struct(&param);
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
param.stride_h, param.stride_w, param.bias, ifmt);
ifmt = CVK_FMT_U8;
param = random_depthwise_param(bk_ctx, 24, 17, 3, ifmt);
init_input(&param, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
free_depthwise_struct(&param);
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
param.stride_h, param.stride_w, param.bias, ifmt);
ifmt = CVK_FMT_BF16;
param = random_depthwise_param(bk_ctx, 48, 65, 3, ifmt);
init_input(&param, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
free_depthwise_struct(&param);
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
param.stride_h, param.stride_w, param.bias, ifmt);
ifmt = CVK_FMT_I8;
param = random_depthwise_param(bk_ctx, 48, 63, 3, ifmt);
init_input(&param, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
free_depthwise_struct(&param);
test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
param.stride_h, param.stride_w, param.bias, ifmt);
#endif
for (int i = 0; i < loop; i++) {
for (int i = 0; ihs[i] != 0; i++) {
for (int j = 0; iws[j] != 0; j++) {
for (int k = 0; stride_hs[k] != 0; k++) {
for (int l = 0; formats[l] != 0; l++) {
continue;
if (ihs[i] >= 480 && formats[l] == CVK_FMT_BF16) {
continue;
}
param = random_depthwise_param(bk_ctx, ihs[i], iws[j], stride_hs[k], formats[l]);
ifmt = formats[l];
printf("test[%d] ih/iw/sh/fmt is {%d, %d, %d, %d}\n", test_finished_num, ihs[i], iws[j],
stride_hs[k], formats[l]);
init_input(&param, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left);
free_depthwise_struct(&param);
int r = test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left,
param.stride_h, param.stride_w, param.bias, ifmt);
test_finished_num += r;
}
}
}
}
}
printf("Test finished %u\n", test_finished_num);
return test_finished_num;
}
/**
 * Entry point of the depthwise test: sets up the runtime, runs every case
 * with the test rounding mode installed, then restores the previous mode.
 *
 * @return 0; a failed comparison terminates the process earlier.
 */
int main() {
  CVI_RT_HANDLE ctx;
  cvk_context_t *bk_ctx;
  test_init(&ctx, &bk_ctx);

  // Remember the rounding mode returned here so it can be restored at exit.
  const int saved_round_mode = set_store_feround();

  const int finished = test_depthwise_pooling(&ctx, bk_ctx);
  assert(finished >= 0);
  (void)finished;
  printf("pass\n");

  test_exit(&ctx, bk_ctx);
  restore_feround(saved_round_mode);
  return 0;
}

View File

@ -0,0 +1,127 @@
#include <cvimath_internal.h>
#include <sys/time.h>
#include <test_cvikernel_util.h>
// Short alias for the global-to-global tensor copy parameter struct used by this test.
typedef cvk_tdma_g2g_tensor_copy_param_t param_t;
/**
 * Writes "<tag>: (src n, c, h, w) => (dst n, c, h, w)" to the given stream.
 *
 * @param tag label printed in front of the shapes
 * @param f   output stream
 * @param p   copy parameters; src and dst must be non-NULL
 */
static void __print_param(const char *tag, FILE *f, param_t *p) {
  fprintf(f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", tag,
          // source shape
          p->src->shape.n, p->src->shape.c, p->src->shape.h, p->src->shape.w,
          // destination shape
          p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, p->dst->shape.w);
}
// Prints a parameter set tagged with the name of the calling function.
#define print_param(f, p) __print_param(__func__, f, p)
// One test case: source and destination tensor shapes as (n, c, h, w).
typedef struct {
cvk_tg_shape_t src_shape;
cvk_tg_shape_t dst_shape;
} case_t;
// {source format, destination format} pairs each case is run with.
static cvk_fmt_type input_fmt[] = {
{CVK_FMT_BF16, CVK_FMT_BF16},
};
// Shapes exercised by the test; source and destination are identical in
// every entry.
static case_t g_cases[] = {
{
{1, 3, 3, 2},
{1, 3, 3, 2},
},
{
{4, 3, 3, 2},
{4, 3, 3, 2},
},
// Disabled case (odd total element count):
//{
// // YOLOv2 concat layer
// {1, 256, 19, 19},
// {1, 256, 19, 19},
//},
{
{1, 256, 19, 20},
{1, 256, 19, 20},
},
{
{1, 1280, 3, 4},
{1, 1280, 3, 4},
},
{
{1, 159 * 89, 36, 4},
{1, 159 * 89, 36, 4},
},
{
{159, 89, 36, 4},
{159, 89, 36, 4},
},
};
/**
 * Runs one fp32 -> bf16 global-memory conversion and verifies the result.
 *
 * The source tensor is filled with 32-bit patterns, converted on the device
 * with cvm_s2s_fp32_bf16(), and every output word is expected to equal the
 * upper 16 bits of the matching source word. The process exits with -1 on the
 * first mismatch.
 *
 * @param ctx runtime handle
 * @param bmk kernel context
 * @param p   copy parameters; src and dst must already be allocated
 */
static void test_param_g2g(CVI_RT_HANDLE *ctx, cvk_context_t *bmk, param_t *p) {
  print_param(stderr, p);
  // 2 means source is fp32, occupy 2 * bf16 size
  // The product is widened before multiplying so it cannot wrap at 32 bits.
  uint64_t size =
      (uint64_t)p->src->shape.n * p->src->shape.c * p->src->shape.h * p->src->shape.w / 2;
  uint32_t *src_data = new uint32_t[size];
  for (uint64_t i = 0; i < size; i++) {
    // Distinct upper and lower halves so a swapped half would be detected.
    src_data[i] = ((0x1234 + i) << 16) + 0x5678 + i;
  }
  test_put_tg_mem_comp(ctx, p->src, (uint8_t *)src_data);
  cvm_s2s_fp32_bf16(bmk, p->src->start_address, p->src->shape, p->dst->start_address, p->dst->shape,
                    CVK_FMT_BF16);
  // Time only the submission of the recorded commands.
  long elapsed;
  struct timeval t0, t1;
  gettimeofday(&t0, NULL);
  test_submit_comp(ctx, bmk);
  gettimeofday(&t1, NULL);
  elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
  printf("kernel takes %ld us\n", elapsed);
  uint16_t *dst_data = (uint16_t *)test_get_tg_mem_comp(ctx, p->dst);
  for (uint64_t i = 0; i < size; i++) {
    uint16_t _src_data = (src_data[i] >> 16) & 0xffff;
    if (dst_data[i] != _src_data) {
      // uint64_t is not `unsigned long` on every target; print it portably.
      fprintf(stderr, "comparing failed at dst[%llu], got %x, exp %x\n", (unsigned long long)i,
              dst_data[i], _src_data);
      exit(-1);
    }
  }
  delete[] src_data;
  free(dst_data);
}
/**
 * Releases the source and destination tensors of a copy parameter set.
 *
 * @param ctx runtime handle
 * @param p   copy parameters whose src and dst are freed
 */
static void destroy_param_g2g(CVI_RT_HANDLE *ctx, param_t *p) {
  // Source first, then destination.
  test_free_tg_mem_comp(ctx, p->src);
  test_free_tg_mem_comp(ctx, p->dst);
}
/**
 * Runs one shape case once for every entry of input_fmt.
 *
 * @param ctx runtime handle
 * @param bmk kernel context
 * @param c   source/destination shapes to test
 */
static void test_one_case(CVI_RT_HANDLE *ctx, cvk_context_t *bmk, case_t *c) {
  const uint32_t fmt_count = sizeof(input_fmt) / sizeof(input_fmt[0]);
  for (uint32_t f = 0; f < fmt_count; ++f) {
    // Allocate both tensors in the formats of this pair.
    cvk_tg_t *src = test_alloc_tg_mem_comp(ctx, bmk, c->src_shape, input_fmt[f].src_fmt);
    cvk_tg_t *dst = test_alloc_tg_mem_comp(ctx, bmk, c->dst_shape, input_fmt[f].dst_fmt);

    param_t copy_param;
    copy_param.src = src;
    copy_param.dst = dst;

    test_param_g2g(ctx, bmk, &copy_param);
    destroy_param_g2g(ctx, &copy_param);
  }
}
int main() {
CVI_RT_HANDLE ctx;
cvk_context_t *bmk;
test_init(&ctx, &bmk);
uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]);
for (uint32_t i = 0; i < nr_cases; i++) test_one_case(&ctx, bmk, &g_cases[i]);
test_exit(&ctx, bmk);
return 0;
}

View File

@ -0,0 +1,845 @@
#include <cvimath_internal.h>
#include <sys/time.h>
#include <test_cvikernel_util.h>
#include <time.h> // clock
// Short alias for the matrix-multiplication parameter struct used by this test.
typedef cvk_tiu_matrix_multiplication_param_t param_t;
// Global slot for the random seed of the current run.
int random_seed;
/**
 * Number of elements in a matrix: shape.n rows times shape.col columns,
 * computed in 64 bits.
 *
 * @param ml matrix descriptor
 * @return element count
 */
static uint64_t matrix_size(const cvk_ml_t *ml) {
  return (uint64_t)ml->shape.n * (uint64_t)ml->shape.col;
}
// Element count of the result matrix referenced by `p`.
static uint64_t res_size(param_t *p) {
  const cvk_ml_t *result = p->res;
  return matrix_size(result);
}
// Allocates (new[]) the host buffer for the left matrix and fills element k
// with convert_fp32_bf16(k). The caller owns the buffer and must delete[] it.
static uint16_t *alloc_left(param_t *p) {
  const uint64_t count = matrix_size(p->left);
  uint16_t *values = new uint16_t[count];
  uint64_t k = 0;
  while (k < count) {
    values[k] = convert_fp32_bf16(k);
    ++k;
  }
  return values;
}
// Allocates (new[]) the host buffer for the right matrix and fills element i
// with convert_fp32_bf16(i). The caller owns the buffer and must delete[] it.
//
// The original body also declared a per-iteration local `val` (0.01, then
// += 0.01) that was never read; it had no effect on the generated data and
// has been removed.
static uint16_t *alloc_right(param_t *p) {
  uint64_t size = matrix_size(p->right);
  uint16_t *buf = new uint16_t[size];
  for (uint64_t i = 0; i < size; i++) {
    buf[i] = convert_fp32_bf16(i);
  }
  return buf;
}
// Allocates (new[]) the host bias buffer and fills element k with
// convert_fp32_hex(k). Returns NULL when the parameter set has no bias
// matrix; otherwise the caller owns the buffer and must delete[] it.
static uint32_t *alloc_bias(param_t *p) {
  if (p->bias) {
    const uint64_t count = matrix_size(p->bias);
    uint32_t *values = new uint32_t[count];
    for (uint64_t k = 0; k != count; ++k) {
      values[k] = convert_fp32_hex(k);
    }
    return values;
  }
  return NULL;
}
// Allocates (new[]) a buffer with one 32-bit slot per result element and
// seeds slot k with convert_fp32_bf16(k). The caller owns the buffer and must
// delete[] it.
static uint32_t *alloc_res(param_t *p) {
  const uint64_t count = res_size(p);
  uint32_t *values = new uint32_t[count];
  uint64_t k = 0;
  while (k < count) {
    values[k] = convert_fp32_bf16(k);
    ++k;
  }
  return values;
}
// In-place ReLU: every element that compares less than zero is replaced by
// zero; all other elements are left untouched.
static inline void cvm_relu(float *buf, uint64_t size) {
  float *const stop = buf + size;
  for (float *cur = buf; cur != stop; ++cur) {
    if (*cur < 0) {
      *cur = 0;
    }
  }
}
// CPU reference for the matrix multiply-accumulate described by `p`.
//
// left/right hold bf16 values, bias holds 32-bit values decoded with
// convert_hex_fp32(), and res is both input (when p->add_result is set) and
// output: on return res[i] holds convert_fp32_bf16() of the accumulated float.
// Accumulation is done in a temporary float buffer of res_size(p) elements.
static void matrix_mac_ref(param_t *p, uint16_t left[], uint16_t right[], uint32_t bias[],
uint32_t res[]) {
uint64_t size = res_size(p);
uint32_t left_col = p->left->shape.col;
uint32_t right_col = p->right->shape.col;
uint32_t res_row = p->left->shape.n;
uint32_t res_col = p->res->shape.col;
uint32_t left_c = p->left->shape.c;
uint32_t left_w = p->left->shape.w;
float *tmp_res = new float[size];
// Seed the accumulator: previous result when add_result is set, zero otherwise.
if (p->add_result) {
for (uint32_t i = 0; i < res_row * res_col; i++) tmp_res[i] = convert_bf16_fp32(res[i]);
} else {
for (uint32_t i = 0; i < res_row * res_col; i++) tmp_res[i] = 0;
}
// The inner dimension is walked as (wi, ci) pairs; ci * left_w + wi is the
// flat column index into the left row and the row index into the right matrix.
for (uint32_t row = 0; row < res_row; row++) {
for (uint32_t col = 0; col < res_col; col++) {
for (uint32_t wi = 0; wi < left_w; wi++) {
for (uint32_t ci = 0; ci < left_c; ci++) {
// c * w may exceed the real column count; skip the positions past left_col.
if ((wi + (ci * left_w)) >= left_col) continue;
uint32_t li = row * left_col + left_w * ci + wi;
uint32_t ri = (ci * left_w + wi) * right_col + col;
float l = convert_bf16_fp32(left[li]);
float r = convert_bf16_fp32(right[ri]);
tmp_res[row * res_col + col] += l * r;
}
}
}
}
// Bias is indexed by column only, so the same value is added to every row.
if (p->bias) {
for (uint32_t row = 0; row < res_row; row++) {
for (uint32_t col = 0; col < res_col; col++) {
float b = convert_hex_fp32(bias[col]);
tmp_res[row * res_col + col] += b;
}
}
}
if (p->relu_enable) cvm_relu(tmp_res, size);
// Write the float accumulator back through convert_fp32_bf16().
for (uint64_t i = 0; i < size; i++) {
res[i] = convert_fp32_bf16(tmp_res[i]);
}
delete[] tmp_res;
}
// Uploads the bias values for `ml`. Each 32-bit input word is split into two
// 16-bit halves: all upper halves are stored first (one per column), followed
// by all lower halves, and the packed buffer is handed to
// test_put_matrix_g2l_comp().
static void put_bias(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx, const cvk_ml_t *ml,
                     uint32_t data[]) {
  const uint64_t cols = ml->shape.col;
  uint16_t *packed = new uint16_t[cols * 2];
  uint16_t *upper = packed;
  uint16_t *lower = packed + cols;
  for (uint64_t c = 0; c != cols; ++c) {
    const uint32_t word = data[c];
    upper[c] = (word >> 16) & 0xFFFF;
    lower[c] = (word & 0xFFFF);
  }
  test_put_matrix_g2l_comp(ctx, bk_ctx, ml, (uint8_t *)packed);
  delete[] packed;
}
// Uploads an initial result matrix for `ml`: the low 16 bits of every 32-bit
// input word are copied into a temporary 16-bit buffer that is handed to
// test_put_matrix_g2l_comp().
static void put_res(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx, const cvk_ml_t *ml,
                    uint32_t data[]) {
  const uint64_t count = ml->shape.n * ml->shape.col;
  uint16_t *narrow = new uint16_t[count];
  uint64_t k = 0;
  while (k < count) {
    narrow[k] = (data[k] & 0xFFFF);
    ++k;
  }
  test_put_matrix_g2l_comp(ctx, bk_ctx, ml, (uint8_t *)narrow);
  delete[] narrow;
}
// Reads the result matrix `mg` back from the device and widens each 16-bit
// element into a freshly new[]-allocated uint32_t buffer of res_size(p)
// elements. The caller owns the returned buffer and must delete[] it.
static uint32_t *get_res(CVI_RT_HANDLE *ctx, cvk_mg_t *mg, param_t *p) {
  uint64_t size = res_size(p);
  uint32_t *res = new uint32_t[size];
  uint16_t *tmp = (uint16_t *)test_get_mg_mem_comp(ctx, mg);
  for (uint64_t i = 0; i < size; i++) res[i] = tmp[i];
  // The buffer comes from the C test utility, not from new[]; buffers returned
  // by the sibling test_get_* helpers are released with free() elsewhere in
  // these tests, so release this one the same way instead of delete[].
  // NOTE(review): confirm test_get_mg_mem_comp allocates with malloc.
  free(tmp);
  return res;
}
// Allocates a device matrix with the same row/column extent as `ml`, copies
// `data` into it, submits the pending work and returns the descriptor. The
// caller releases it with test_free_mg_mem_comp().
static inline cvk_mg_t *put_bf16_matrix_g(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx,
                                          const cvk_ml_t *ml, uint8_t data[],
                                          cvk_fmt_t mg_data_format) {
  cvk_mg_shape_t global_shape;
  global_shape.col = ml->shape.col;
  global_shape.row = ml->shape.n;

  cvk_mg_t *device_matrix = test_alloc_mg_mem_comp(ctx, global_shape, mg_data_format);
  test_put_mg_mem_comp(ctx, device_matrix, data);
  test_submit_comp(ctx, bk_ctx);
  return device_matrix;
}
// Runs one GEMM through cvm_gemm() for the shapes in `p` and compares the
// device result element-by-element against matrix_mac_ref().
//
// On the first mismatch the offending element and random_seed are printed to
// stderr and the process exits with -1. All host buffers and device matrices
// allocated here are released before returning; the local-memory descriptors
// inside `p` are left for destroy_param().
static void test_param(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx, param_t *p) {
  uint16_t *left = alloc_left(p);
  uint16_t *right = alloc_right(p);
  uint32_t *bias = alloc_bias(p);
  uint32_t *ref = alloc_res(p);
  cvk_mg_t *left_mg = put_bf16_matrix_g(ctx, bk_ctx, p->left, (uint8_t *)left, CVK_FMT_BF16);
  cvk_mg_t *right_mg = put_bf16_matrix_g(ctx, bk_ctx, p->right, (uint8_t *)right, CVK_FMT_BF16);
  cvk_mg_shape_t s;
  s.row = p->res->shape.n;
  s.col = p->res->shape.col;
  cvk_mg_t *result_mg = test_alloc_mg_mem_comp(ctx, s, CVK_FMT_BF16);
  if (bias) put_bias(ctx, bk_ctx, p->bias, bias);
  if (p->add_result) put_res(ctx, bk_ctx, p->res, ref);
  printf("start\n");
  size_t *slice_num =
      cvm_gemm(bk_ctx, left_mg->start_address, right_mg->start_address, result_mg->start_address,
               p->left->shape.n, p->left->shape.col, p->res->shape.col, CVK_FMT_BF16);
  free(slice_num);  // the slice information is not needed for bf16
  test_submit_comp(ctx, bk_ctx);
  uint32_t *res = get_res(ctx, result_mg, p);
  // `ref` is overwritten in place with the CPU reference result.
  matrix_mac_ref(p, left, right, bias, ref);
  uint64_t size = res_size(p);
  for (uint64_t i = 0; i < size; i++) {
    if (res[i] != ref[i]) {
      uint16_t _res = res[i] & 0xffff;
      uint16_t _ref = ref[i] & 0xffff;
      fprintf(stderr, "comparing failed at out[%lu], got %f(0x%x), exp %f(0x%x)\n", i,
              convert_bf16_fp32(_res), res[i], convert_bf16_fp32(_ref), ref[i]);
      fprintf(stderr, "random_seed=%d\n", random_seed);
      exit(-1);
    }
  }
  test_free_mg_mem_comp(ctx, left_mg);
  test_free_mg_mem_comp(ctx, right_mg);
  test_free_mg_mem_comp(ctx, result_mg);
  delete[] left;
  delete[] right;
  delete[] bias;
  // Previously leaked: the reference buffer from alloc_res() was never freed.
  delete[] ref;
  delete[] res;
}
// Frees whichever local-memory matrices of `p` are non-NULL through
// lmem_free_matrix(), in the order bias, res, right, left. That is the
// reverse of the order param_0() allocates them in.
// NOTE(review): the reverse order presumably matters to the allocator - confirm
// before reordering.
static void destroy_param(cvk_context_t *bk_ctx, param_t *p) {
if (p->bias) bk_ctx->ops->lmem_free_matrix(bk_ctx, p->bias);
if (p->res) bk_ctx->ops->lmem_free_matrix(bk_ctx, p->res);
if (p->right) bk_ctx->ops->lmem_free_matrix(bk_ctx, p->right);
if (p->left) bk_ctx->ops->lmem_free_matrix(bk_ctx, p->left);
}
// Creates the result-matrix descriptor for `p`.
//
// Only a 1x1x1x1 placeholder is actually allocated in local memory; the
// descriptor's shape is then overwritten with the logical result shape
// (rows from the left matrix; c, w and col from the right matrix).
//
// Returns NULL when lmem_alloc_matrix() fails. The caller already checks for
// that, but the original code wrote t->shape before any check and would have
// dereferenced the NULL pointer.
static cvk_ml_t *alloc_param_res(cvk_context_t *bk_ctx, param_t *p) {
  cvk_ml_shape_t s;
  s.n = p->left->shape.n;
  s.c = p->right->shape.c;
  s.w = p->right->shape.w;
  s.col = p->right->shape.col;
  cvk_fmt_t fmt = CVK_FMT_BF16;
  cvk_ml_shape_t fake;
  fake.n = 1;
  fake.c = 1;
  fake.w = 1;
  fake.col = 1;
  cvk_ml_t *t = bk_ctx->ops->lmem_alloc_matrix(bk_ctx, fake, fmt, 1);
  if (t) t->shape = s;
  return t;
}
// Builds the parameter set used by test_one_param(0).
//
// The random draws are kept (so the rand() sequence is unchanged) but several
// of them are immediately overridden with fixed values: relu and bias are
// disabled and the matrices are forced to 1024 x 1024 x 1024. Only the w/c
// split of each matrix remains random.
//
// The left/right/res descriptors are 1x1x1x1 placeholder allocations whose
// shape field is overwritten with the logical shape. If any allocation fails
// everything is released and the whole procedure is retried with a new seed.
//
// Fix: the shape of a freshly allocated descriptor is now written only when
// the allocation succeeded; previously t->shape was assigned before the NULL
// checks below, so a failed allocation crashed instead of reaching the retry.
static param_t param_0(cvk_context_t *bk_ctx) {
retry:
  random_seed = clock();
  srand(random_seed);
  param_t p;
  memset(&p, 0, sizeof(p));
  p.lshift_bits = 0;
  p.rshift_bits = 0;
  p.res_is_int8 = true;
  p.relu_enable = rand() % 2;
  p.relu_enable = 0;
  p.add_result = 0; /*bf16 HW does not support add_result*/
  p.ps32_mode = 0;
  uint32_t left_row = rand() % 100 + 1;
  uint32_t left_col = rand() % 100 + 1;
  left_row = 1024;
  left_col = 1024;
  // c is derived from w; keeping w small makes c larger.
  uint32_t left_w = rand() % (left_col / 5 + 1) + 1;
  uint32_t left_c = left_col / left_w + (left_col % left_w ? 1 : 0);
  uint32_t right_row = left_col;
  uint32_t right_col = rand() % 100 + 1;
  right_col = 1024;
  uint32_t right_w = (rand() % (right_col / 5 + 1) + 1);  // keep w small so c is larger
  uint32_t right_c = right_col / right_w + (right_col % right_w ? 1 : 0);
  cvk_ml_shape_t left_shape;
  left_shape.n = left_row;
  left_shape.c = left_c;
  left_shape.w = left_w;
  left_shape.col = left_col;
  cvk_ml_shape_t right_shape;
  right_shape.n = right_row;
  right_shape.c = right_c;
  right_shape.w = right_w;
  right_shape.col = right_col;
  uint32_t bias = rand() % 2;
  bias = 0;
  p.bias = NULL;
  cvk_ml_shape_t fake;
  fake.n = 1;
  fake.c = 1;
  fake.w = 1;
  fake.col = 1;
  cvk_ml_t *t = bk_ctx->ops->lmem_alloc_matrix(bk_ctx, fake, CVK_FMT_BF16, 1);
  if (t) t->shape = left_shape;
  p.left = t;
  t = bk_ctx->ops->lmem_alloc_matrix(bk_ctx, fake, CVK_FMT_BF16, 1);
  if (t) t->shape = right_shape;
  p.right = t;
  if (!p.left || !p.right) {
    printf("retry init_matrix_param\n");
    destroy_param(bk_ctx, &p);
    goto retry;
  }
  p.res = alloc_param_res(bk_ctx, &p);
  if (bias) {
    cvk_ml_shape_t bias_shape = right_shape;
    bias_shape.n = 2;
    p.bias = bk_ctx->ops->lmem_alloc_matrix(bk_ctx, bias_shape, CVK_FMT_BF16, 1);
  }
  if (!p.res || (bias && !p.bias)) {
    printf("retry init_matrix_param\n");
    destroy_param(bk_ctx, &p);
    goto retry;
  }
  return p;
}
// gemm test function
//#define USE_CBLAS_VERITY (1)
#ifdef USE_CBLAS_VERITY
#include <cblas.h>
#endif /* ifdef USE_CBLAS_VERITY */
// comes from
// https://stackoverflow.com/questions/47023651/multiplying-matrices-in-one-dimensional-arrays
// Naive row-major bf16 matrix product d = a * b.
// a is row1 x col1, b is row2 x col2 (col1 must equal row2), d is row1 x col2.
// Every product is accumulated in a float and the finished sum is converted
// back with convert_fp32_bf16().
void multiply(uint16_t *a, int row1, int col1, uint16_t *b, int row2, int col2, uint16_t *d) {
  assert(col1 == row2);
  // row2 is only consumed by the assert; keep release builds warning-free
  (void)row2;
  for (int r = 0; r < row1; r++) {
    const uint16_t *lhs_row = a + r * col1;
    uint16_t *out_row = d + r * col2;
    for (int c = 0; c < col2; c++) {
      float acc = 0;
      for (int k = 0; k < col1; k++) {
        float lhs = convert_bf16_fp32(lhs_row[k]);
        float rhs = convert_bf16_fp32(b[k * col2 + c]);
        acc = acc + lhs * rhs;
      }
      out_row[c] = convert_fp32_bf16(acc);
    }
  }
#if 0
  for (int i = 0; i < size; i++) {
    if (i % col2 == 0) {
      printf("\n");
    }
    printf("%f ", convert_bf16_fp32(d[i]));
  }
#endif
}
#ifdef USE_CBLAS_VERITY
#else
// Naive row-major 8-bit matrix product with 32-bit accumulation: d = a * b.
// a is row1 x col1, b is row2 x col2 (col1 must equal row2), d is row1 x col2.
// Elements are read as signed 8-bit when fmt is CVK_FMT_I8 and as unsigned
// 8-bit for any other format.
static void multiply_i32(uint8_t *a, int row1, int col1, uint8_t *b, int row2, int col2,
                         uint32_t *d, cvk_fmt_t fmt) {
  assert(col1 == row2);
  // row2 is only consumed by the assert; keep release builds warning-free
  (void)row2;
  const bool read_signed = (fmt == CVK_FMT_I8);
  for (int r = 0; r < row1; r++) {
    const uint8_t *lhs_row = a + r * col1;
    for (int c = 0; c < col2; c++) {
      int acc = 0;
      for (int k = 0; k < col1; k++) {
        const uint8_t lhs_raw = lhs_row[k];
        const uint8_t rhs_raw = b[k * col2 + c];
        int lhs = read_signed ? (int8_t)(lhs_raw) : (lhs_raw);
        int rhs = read_signed ? (int8_t)(rhs_raw) : (rhs_raw);
        acc = acc + lhs * rhs;
      }
      d[r * col2 + c] = (acc);
    }
  }
#if 0
  for (int i = 0; i < size; i++) {
    if (i % col2 == 0) {
      printf("\n");
    }
    printf("%f ", convert_bf16_fp32(d[i]));
  }
#endif
}
#endif /* ifdef USE_CBLAS_VERITY */
// Compares two arrays of `count` 16-bit (bf16) values.
// Returns 0 when they match; on the first difference prints the index and
// both values (decimal, as float, and hex) prefixed by `info`, then returns -1.
int array_cmp_int16(const char *const info, const uint16_t *p_exp, const uint16_t *p_got,
                    int count) {
  int idx = 0;
  while (idx < count) {
    const uint16_t expected = p_exp[idx];
    const uint16_t actual = p_got[idx];
    if (expected != actual) {
      printf("%s error at index %d exp %d(%f,0x%x) got %d(%f,0x%x)\n", info, idx, expected,
             convert_bf16_fp32(expected), expected, actual, convert_bf16_fp32(actual), actual);
      return -1;
    }
    ++idx;
  }
  return 0;
}
// Compares two arrays of `count` 32-bit values.
// Returns 0 when they match; on the first difference prints the index and
// both values prefixed by `info`, then returns -1.
int array_cmp_int32(const char *const info, const uint32_t *p_exp, const uint32_t *p_got,
                    int count) {
  int idx = 0;
  while (idx < count) {
    const uint32_t expected = p_exp[idx];
    const uint32_t actual = p_got[idx];
    if (expected != actual) {
      printf("%s error at index %d exp %d got %d\n", info, idx, expected, actual);
      return -1;
    }
    ++idx;
  }
  return 0;
}
// Fills `matrix` with a small repeating bf16 pattern: element i holds (i % 8)
// for even i and -(i % 8) for odd i, i.e. 0, -1, 2, -3, 4, -5, 6, -7, ...
//
// Fix: the sign used to be applied as `-1 * (i % 8)` with i of type size_t.
// The usual arithmetic conversions turn -1 into an unsigned value, so the
// product wrapped to a huge positive number (about 1.8e19 with a 64-bit
// size_t) instead of a small negative one. The sign is now applied in
// floating point.
static void assign_bf16_values_to_matrix(uint16_t *matrix, size_t size) {
  for (size_t i = 0; i < size; i++) {
    float f;
#if 1
    const float magnitude = static_cast<float>(i % 8);
    f = (i % 2 == 0) ? magnitude : -magnitude;
#else
    // Alternative pattern: alternating-sign ramp, scaled and offset by size.
    const float ramp = (i % 2 ? -1.0f : 1.0f) * static_cast<float>(i);
    f = ramp * 0.01 + size * 0.01;
#endif
    matrix[i] = convert_fp32_bf16(f);
    // printf("f[%lu] is %f(0x%x)\n", i, f, matrix[i]);
  }
}
// Expands `size` bf16 values into floats via convert_bf16_fp32().
static void uint16_to_float(float *float_data, uint16_t *bf16_data, size_t size) {
  size_t k = 0;
  while (k < size) {
    float_data[k] = convert_bf16_fp32(bf16_data[k]);
    ++k;
  }
}
// Converts `size` 8-bit values to float. Bytes are interpreted as signed when
// fmt is CVK_FMT_I8 and as unsigned for every other format.
static void uint8_to_float(float *float_data, uint8_t *i8_data, size_t size, cvk_fmt_t fmt) {
  const bool read_signed = (fmt == CVK_FMT_I8);
  for (size_t k = 0; k != size; ++k) {
    const uint8_t raw = i8_data[k];
    const int value = read_signed ? (int)(int8_t)(raw) : (int)(raw);
    float_data[k] = (float)value;
  }
}
// Fills `matrix` with the byte ramp (i + 20), truncated to 8 bits, so the
// sequence wraps around after 255.
static void assign_i8_values_to_matrix(uint8_t *matrix, size_t size) {
  size_t k = 0;
  while (k < size) {
    matrix[k] = static_cast<uint8_t>(k + 20);
    ++k;
  }
}
#ifdef USE_CBLAS_VERITY
// Converts `size` floats to bf16 bit patterns via convert_fp32_bf16().
static void float_to_int16(uint16_t *int16_data, float *float_data, size_t size) {
  size_t k = 0;
  while (k < size) {
    int16_data[k] = convert_fp32_bf16(float_data[k]);
    ++k;
  }
}
// Converts `size` floats to uint32_t with a plain C cast (truncation toward
// zero).
static void float_to_int32(uint32_t *int32_data, float *float_data, size_t size) {
  const float *src = float_data;
  uint32_t *dst = int32_data;
  for (size_t remaining = size; remaining > 0; --remaining) {
    *dst = (uint32_t)*src;
    ++dst;
    ++src;
  }
}
#endif
// int8
// 8-bit GEMM test: multiplies an M x K matrix by a K x N matrix on the CPU
// (reference) and through cvm_gemm() on the device, then compares the 32-bit
// results.
//
// fmt selects how the 8-bit inputs are interpreted (signed for CVK_FMT_I8,
// unsigned otherwise). Returns 0 when the device result matches the
// reference and -1 otherwise. The function creates and destroys its own
// runtime/kernel context.
//
// Fixes compared to the previous revision:
//  - the optional cblas reference path used alpha = 0, which makes sgemm
//    return beta * C, i.e. an all-zero reference; alpha is now 1;
//  - the device output buffer is value-initialised before it is uploaded, so
//    no indeterminate host memory is copied to the device.
static int _test_bmblas_gemm_bm1880v2(size_t M, size_t N, size_t K, cvk_fmt_t fmt) {
  long elapsed;
  struct timeval t0, t1;
  int ret = 0;
  uint8_t *i8_A = new uint8_t[M * K];
  uint8_t *i8_B = new uint8_t[N * K];
  uint8_t *i8_C = new uint8_t[4 * M * N]();  // 32 bit output, zero-filled
  uint32_t *i32bit_ref = new uint32_t[M * N];
  assign_i8_values_to_matrix(i8_A, M * K);
  assign_i8_values_to_matrix(i8_B, N * K);
  float *float_A = new float[M * K];
  float *float_B = new float[N * K];
  float *float_C_ref = new float[M * N];
  uint8_to_float(float_A, i8_A, M * K, fmt);
  uint8_to_float(float_B, i8_B, N * K, fmt);
#if 0
  printf("\nA:");
  for (int i = 0; i < M; i++) {
    printf("\n");
    for (int j = 0; j < K; j++) {
      printf("%e(0x%x) ", float_A[i * K + j], i8_A[i * K + j]);
    }
  }
  printf("\nB:");
  for (int i = 0; i < K; i++) {
    printf("\n");
    for (int j = 0; j < N; j++) {
      printf("%e(0x%x) ", float_B[i * N + j], i8_B[i * N + j]);
    }
  }
  printf("\nR:");
  for (int i = 0; i < M; i++) {
    printf("\n");
    for (int j = 0; j < N; j++) {
      printf("%e ", convert_i8_fp32(i32bit_ref[i * N + j]));
    }
  }
#endif
  // CPU reference, timed.
  gettimeofday(&t0, NULL);
#ifdef USE_CBLAS_VERITY
  // C = alpha * A * B + beta * C; alpha must be 1 to obtain the plain product.
  float alpha = 1.0f;
  float beta = 0.0f;
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha, float_A, K, float_B, N,
              beta, float_C_ref, N);
  float_to_int32(i32bit_ref, float_C_ref, M * N);
#else  /* ! ifdef USE_CBLAS_VERITY */
  multiply_i32(i8_A, M, K, i8_B, K, N, i32bit_ref, fmt);
#endif /* ifdef USE_CBLAS_VERITY */
  gettimeofday(&t1, NULL);
  elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
#ifdef USE_CBLAS_VERITY
  printf("cblas GEMM takes %ld us\n", elapsed);
#else  /* ! ifdef USE_CBLAS_VERITY */
  printf("CPU GEMM takes %ld us\n", elapsed);
#endif /* ifdef USE_CBLAS_VERITY */
  CVI_RT_HANDLE ctx;
  cvk_context_t *bk_ctx;
  test_init(&ctx, &bk_ctx);
  // alloc device memory; the result area is 2M x 2N bytes, matching the
  // 4 * M * N byte host buffer i8_C
  cvk_mg_shape_t s_a = {(uint32_t)M, (uint32_t)K};
  cvk_mg_shape_t s_b = {(uint32_t)K, (uint32_t)N};
  cvk_mg_shape_t s_r = {2 * (uint32_t)M, 2 * (uint32_t)N};
  size_t s_size_a = mg_shape_size(&s_a) * bytesize_of_fmt(fmt);
  size_t s_size_b = mg_shape_size(&s_b) * bytesize_of_fmt(fmt);
  size_t s_size_r = mg_shape_size(&s_r) * bytesize_of_fmt(fmt);
  CVI_RT_MEM devmem_a = CVI_RT_MemAlloc(ctx, s_size_a);
  CVI_RT_MEM devmem_b = CVI_RT_MemAlloc(ctx, s_size_b);
  CVI_RT_MEM devmem_r = CVI_RT_MemAlloc(ctx, s_size_r);
  gaddr_t gaddr_a = CVI_RT_MemGetPAddr(devmem_a);
  gaddr_t gaddr_b = CVI_RT_MemGetPAddr(devmem_b);
  gaddr_t gaddr_r = CVI_RT_MemGetPAddr(devmem_r);
  // copy to device memory
  CVI_RT_MemCopyS2D(ctx, devmem_a, (uint8_t *)i8_A);
  CVI_RT_MemCopyS2D(ctx, devmem_b, (uint8_t *)i8_B);
  CVI_RT_MemCopyS2D(ctx, devmem_r, (uint8_t *)i8_C);
  // do computation with bmkernel
  // bmruntime_bmkernel_create(ctx, (void**)&bk_ctx);
  // printf("gaddr_a/gaddr_b/gaddr_r at %zx %zx %zx\n", gaddr_a, gaddr_b, gaddr_r);
  size_t *slice_num = cvm_gemm((cvk_context_t *)bk_ctx, gaddr_a, gaddr_b, gaddr_r, M, K, N, fmt);
  // Only the submit call is timed.
  gettimeofday(&t0, NULL);
  test_submit_comp(&ctx, bk_ctx);
  gettimeofday(&t1, NULL);
  elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
  printf("TPU GEMM takes %ld us\n", elapsed);
  CVI_RT_MemCopyD2S(ctx, (uint8_t *)i8_C, devmem_r);
  CVI_RT_MemFree(ctx, devmem_a);
  CVI_RT_MemFree(ctx, devmem_b);
  CVI_RT_MemFree(ctx, devmem_r);
  test_exit(&ctx, bk_ctx);
  // The device output is strided; cvm_combin_gemm_i8() assembles it into
  // M * N 32-bit values using the slice information returned by cvm_gemm().
  uint32_t *i32_C = new uint32_t[M * N];
  cvm_combin_gemm_i8(slice_num, i8_C, i32_C, M, N);
  free(slice_num);
  int cmp_res = array_cmp_int32("gemm", i32bit_ref, i32_C, M * N);
  if (cmp_res != 0) {
    ret = -1;
    printf("Comparison failed for cblas_sgemm and bmblas_gemm!\n");
#if 0
    printf("\nref/cmd is:");
    for (int i = 0; i < M; i++) {
      printf(">\n");
      for (int j = 0; j < N; j++) {
        printf("%f(0x%x)/%f(0x%x) ",
            convert_i8_fp32(i32bit_ref[i * N + j]), i32bit_ref[i * N + j],
            convert_i8_fp32(i8_C[i * N + j]), i8_C[i * N + j]
            );
      }
    }
#endif
  } else {
    // printf("Comparison done for cblas_sgemm and bmblas_gemm!\n\n");
  }
  delete[] float_A;
  delete[] float_B;
  delete[] float_C_ref;
  delete[] i8_A;
  delete[] i8_B;
  delete[] i8_C;
  delete[] i32bit_ref;
  delete[] i32_C;
  return ret;
}
// GEMM test entry point: multiplies an M x K matrix by a K x N matrix on the
// CPU (reference) and through cvm_gemm() on the device, then compares the
// results.
//
// Every format other than CVK_FMT_BF16 is forwarded to the 8-bit variant
// _test_bmblas_gemm_bm1880v2(); the rest of this function is the bf16 path.
// Returns 0 when the device result matches the reference and -1 otherwise.
// The function creates and destroys its own runtime/kernel context.
//
// Fixes compared to the previous revision:
//  - the optional cblas reference path used alpha = 0, which makes sgemm
//    return beta * C, i.e. an all-zero reference; alpha is now 1;
//  - the device output buffer is value-initialised before it is uploaded, so
//    no indeterminate host memory is copied to the device.
int test_bmblas_gemm_bm1880v2(size_t M, size_t N, size_t K, cvk_fmt_t fmt) {
  printf("%s: M=%zu, N=%zu, K=%zu, fmt_sz: %d\n", __func__, M, N, K, cvm_bytesize_of_fmt(fmt));
  // FIXME: the two code paths are largely duplicated and should be merged
  if (fmt != CVK_FMT_BF16) {
    return _test_bmblas_gemm_bm1880v2(M, N, K, fmt);
  }
  long elapsed;
  struct timeval t0, t1;
  int ret = 0;
  uint16_t *bf16_A = new uint16_t[M * K];
  uint16_t *bf16_B = new uint16_t[N * K];
  uint16_t *bf16_C = new uint16_t[2 * M * N]();  // zero-filled before upload
  uint16_t *int16_C_ref = new uint16_t[M * N];
  assign_bf16_values_to_matrix(bf16_A, M * K);
  assign_bf16_values_to_matrix(bf16_B, N * K);
  float *float_A = new float[M * K];
  float *float_B = new float[N * K];
  float *float_C_ref = new float[M * N];
  uint16_to_float(float_A, bf16_A, M * K);
  uint16_to_float(float_B, bf16_B, N * K);
#if 0
  printf("\nA:");
  for (int i = 0; i < M; i++) {
    printf("\n");
    for (int j = 0; j < K; j++) {
      printf("%e(0x%x) ", float_A[i * K + j], bf16_A[i * K + j]);
    }
  }
  printf("\nB:");
  for (int i = 0; i < K; i++) {
    printf("\n");
    for (int j = 0; j < N; j++) {
      printf("%e(0x%x) ", float_B[i * N + j], bf16_B[i * N + j]);
    }
  }
  printf("\nR:");
  for (int i = 0; i < M; i++) {
    printf("\n");
    for (int j = 0; j < N; j++) {
      printf("%e ", convert_bf16_fp32(int16_C_ref[i * N + j]));
    }
  }
#endif
  // CPU reference, timed (the float buffers are released inside the timed
  // region, as before).
  gettimeofday(&t0, NULL);
#ifdef USE_CBLAS_VERITY
  // C = alpha * A * B + beta * C; alpha must be 1 to obtain the plain product.
  float alpha = 1.0f;
  float beta = 0.0f;
  cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha, float_A, K, float_B, N,
              beta, float_C_ref, N);
  float_to_int16(int16_C_ref, float_C_ref, M * N);
#else  /* ! ifdef USE_CBLAS_VERITY */
  multiply(bf16_A, M, K, bf16_B, K, N, int16_C_ref);
#endif /* ifdef USE_CBLAS_VERITY */
  delete[] float_A;
  delete[] float_B;
  delete[] float_C_ref;
  gettimeofday(&t1, NULL);
  elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
#ifdef USE_CBLAS_VERITY
  printf("cblas GEMM takes %ld us\n", elapsed);
#else
  printf("CPU GEMM takes %ld us\n", elapsed);
#endif
  CVI_RT_HANDLE ctx;
  cvk_context_t *bk_ctx;
  test_init(&ctx, &bk_ctx);
  // alloc device memory; the result area is sized with the format byte size
  // applied twice, matching the 2 * M * N element host buffer bf16_C
  cvk_mg_shape_t s_a = {(uint32_t)M, (uint32_t)K};
  cvk_mg_shape_t s_b = {(uint32_t)K, (uint32_t)N};
  cvk_mg_shape_t s_r = {(uint32_t)M, (uint32_t)N};
  size_t s_size_a = mg_shape_size(&s_a) * bytesize_of_fmt(fmt);
  size_t s_size_b = mg_shape_size(&s_b) * bytesize_of_fmt(fmt);
  size_t s_size_r = mg_shape_size(&s_r) * bytesize_of_fmt(fmt) * bytesize_of_fmt(fmt);
  CVI_RT_MEM devmem_a = CVI_RT_MemAlloc(ctx, s_size_a);
  CVI_RT_MEM devmem_b = CVI_RT_MemAlloc(ctx, s_size_b);
  CVI_RT_MEM devmem_r = CVI_RT_MemAlloc(ctx, s_size_r);
  gaddr_t gaddr_a = CVI_RT_MemGetPAddr(devmem_a);
  gaddr_t gaddr_b = CVI_RT_MemGetPAddr(devmem_b);
  gaddr_t gaddr_r = CVI_RT_MemGetPAddr(devmem_r);
  // copy to device memory
  CVI_RT_MemCopyS2D(ctx, devmem_a, (uint8_t *)bf16_A);
  CVI_RT_MemCopyS2D(ctx, devmem_b, (uint8_t *)bf16_B);
  CVI_RT_MemCopyS2D(ctx, devmem_r, (uint8_t *)bf16_C);
  // do computation with bmkernel
  // bmruntime_bmkernel_create(ctx, (void**)&bk_ctx);
  size_t *slice_num =
      cvm_gemm((cvk_context_t *)bk_ctx, gaddr_a, gaddr_b, gaddr_r, M, K, N, CVK_FMT_BF16);
  free(slice_num);  // the slice information is not used for bf16
  // Only the submit call is timed.
  gettimeofday(&t0, NULL);
  test_submit_comp(&ctx, bk_ctx);
  gettimeofday(&t1, NULL);
  elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec;
  printf("TPU GEMM takes %ld us\n", elapsed);
  CVI_RT_MemCopyD2S(ctx, (uint8_t *)bf16_C, devmem_r);
  // bmruntime_bmkernel_destroy(ctx);
  CVI_RT_MemFree(ctx, devmem_a);
  CVI_RT_MemFree(ctx, devmem_b);
  CVI_RT_MemFree(ctx, devmem_r);
  test_exit(&ctx, bk_ctx);
  // Only the first M * N elements of the device buffer are compared.
  int cmp_res = array_cmp_int16("gemm", int16_C_ref, bf16_C, M * N);
  if (cmp_res != 0) {
    ret = -1;
    printf("Comparison failed for cblas_sgemm and bmblas_gemm!\n");
#if 0
    printf("\nref/cmd is:");
    for (int i = 0; i < M; i++) {
      printf(">\n");
      for (int j = 0; j < N; j++) {
        printf("%f(0x%x)/%f(0x%x) ",
            convert_bf16_fp32(int16_C_ref[i * N + j]), int16_C_ref[i * N + j],
            convert_bf16_fp32(bf16_C[i * N + j]), bf16_C[i * N + j]
            );
      }
    }
#endif
  } else {
    // printf("Comparison done for cblas_sgemm and bmblas_gemm!\n\n");
  }
  delete[] bf16_A;
  delete[] bf16_B;
  delete[] bf16_C;
  delete[] int16_C_ref;
  return ret;
}
// test_one_param(n): build a parameter set with param_<n>(), run test_param()
// on it and release it with destroy_param(). Expects variables named `ctx`
// and `bk_ctx` to be in scope at the point of use.
#define test_one_param(n) \
do { \
param_t p = param_##n(bk_ctx); \
test_param(&ctx, bk_ctx, &p); \
destroy_param(bk_ctx, &p); \
} while (0)
int main() {
int round_mode;
round_mode = set_store_feround();
CVI_RT_HANDLE ctx;
cvk_context_t *bk_ctx;
test_init(&ctx, &bk_ctx);
// int8 example
if (0 != test_bmblas_gemm_bm1880v2(1, 100, 512, CVK_FMT_I8)) exit(-1);
if (0 != test_bmblas_gemm_bm1880v2(2, 100, 512, CVK_FMT_I8)) exit(-1);
if (0 != test_bmblas_gemm_bm1880v2(4, 100, 512, CVK_FMT_I8)) exit(-1);
if (0 != test_bmblas_gemm_bm1880v2(8, 100, 512, CVK_FMT_I8)) exit(-1);
if (0 != test_bmblas_gemm_bm1880v2(1, 20000, 512, CVK_FMT_I8)) exit(-1);
if (0 != test_bmblas_gemm_bm1880v2(10, 200, 10, CVK_FMT_I8)) exit(-1);
if (0 != test_bmblas_gemm_bm1880v2(1, 200, 500, CVK_FMT_I8)) exit(-1);
if (0 != test_bmblas_gemm_bm1880v2(1, 20, 50, CVK_FMT_I8)) exit(-1);
if (0 != test_bmblas_gemm_bm1880v2(2, 10, 100, CVK_FMT_I8)) exit(-1);
if (0 != test_bmblas_gemm_bm1880v2(2, 1000, 5, CVK_FMT_I8)) exit(-1);
if (0 != test_bmblas_gemm_bm1880v2(20, 5, 5, CVK_FMT_I8)) exit(-1);
if (0 != test_bmblas_gemm_bm1880v2(2, 5, 5, CVK_FMT_I8)) exit(-1);
cvk_fmt_t fmts[2] = {CVK_FMT_BF16, CVK_FMT_I8};
// cvk_fmt_t fmts[1] = {CVK_FMT_BF16};
int fmts_sz = sizeof(fmts) / sizeof(fmts[0]);
for (int i = 0; i < fmts_sz; i++) {
cvk_fmt_t fmt = fmts[i];
if (0) {
// backend implement
for (int i = 0; i < 30; i++) test_one_param(0);
} else {
// gemm, plz refer bmtap2/libbmblas
int M = 10000;
int N = 10000;
int K = 1024;
M = 2000;
N = 2000;
int m, k, n;
if (0) {
for (m = 1; m <= M; m *= 10) {
for (n = 1; n <= N; n += 200) {
for (k = 1; k <= K; k *= 2) {
if (0 != test_bmblas_gemm_bm1880v2(m, n, k, fmt)) {
exit(-1);
}
}
}
}
}
if (1) {
if (0 != test_bmblas_gemm_bm1880v2(1, 500, 512, fmt)) exit(-1);
if (0 != test_bmblas_gemm_bm1880v2(1, 750, 512, fmt)) exit(-1);
if (0 != test_bmblas_gemm_bm1880v2(1, 100, 512, fmt)) exit(-1);
if (0 != test_bmblas_gemm_bm1880v2(2, 100, 512, fmt)) exit(-1);
if (0 != test_bmblas_gemm_bm1880v2(4, 100, 512, fmt)) exit(-1);
if (0 != test_bmblas_gemm_bm1880v2(8, 100, 512, fmt)) exit(-1);
// if (0 != test_bmblas_gemm_bm1880v2(1, 50000, 512, fmt)) exit(-1);
// if (0 != test_bmblas_gemm_bm1880v2(1, 75000, 512, fmt)) exit(-1);
// if (0 != test_bmblas_gemm_bm1880v2(1, 10000, 512, fmt)) exit(-1);
// if (0 != test_bmblas_gemm_bm1880v2(2, 10000, 512, fmt)) exit(-1);
// if (0 != test_bmblas_gemm_bm1880v2(4, 10000, 512, fmt)) exit(-1);
// if (0 != test_bmblas_gemm_bm1880v2(8, 10000, 512, fmt)) exit(-1);
}
printf("Comparison done for cblas_sgemm and bmblas_gemm!\n\n");
}
}
test_exit(&ctx, bk_ctx);
restore_feround(round_mode);
return 0;
}

View File

@ -0,0 +1,158 @@
#include <cvimath_internal.h>
#include <test_cvikernel_util.h>
#define OUT
#define IN
#include <cfloat>
#include <iomanip>
#include <iostream>
#include <map>
#include <random>
#include <string>
//#define DBG
using namespace std;
/**
* pre_data means we test a fixed pattern; it should be the same as the LUT
*/
// enum TEST_MODE {
// CVM_MASK_TYPE_GT_0 = 0, // remain > 0
// //CVM_MASK_TYPE_GE_0, // remain >= 0
// //CVM_MASK_TYPE_EQ_0, // remain = 0
// //CVM_MASK_TYPE_LT_0, // remain < 0
// //CVM_MASK_TYPE_LE_0, // remain <= 0
// CVM_MASK_MAX
//};
enum CVM_MASK_TYPE mode;
// One fixed test vector: `len` input values and the `len` expected outputs.
// Field order matters because instances are built with positional aggregate
// initialisers.
struct pattern {
float *input;  // values fed to the kernel
float *ref;    // expected result for each input value
int len;       // number of elements in input and ref
};
#define SIZEOF(x) (sizeof(x) / sizeof(x[0]))
// Shared input vector: four negative values, four positive values and a
// trailing zero.
float cvm_mask_type_gt_0_input[] = {-1 * pow(2, -62), -0.003, -1.0, -100000, 0.000001, 1, 1000,
pow(2, 62), 0};
// Expected masks for the input above, one array per comparison against zero
// (1 where the comparison holds, 0 elsewhere).
float cvm_mask_type_gt_0_output[] = {0, 0, 0, 0, 1, 1, 1, 1, 0};
float cvm_mask_type_ge_0_output[] = {0, 0, 0, 0, 1, 1, 1, 1, 1};
float cvm_mask_type_eq_0_output[] = {0, 0, 0, 0, 0, 0, 0, 0, 1};
float cvm_mask_type_lt_0_output[] = {1, 1, 1, 1, 0, 0, 0, 0, 0};
float cvm_mask_type_le_0_output[] = {1, 1, 1, 1, 0, 0, 0, 0, 1};
int input_sz = sizeof(cvm_mask_type_gt_0_input) / sizeof(cvm_mask_type_gt_0_input[0]);
// Indexed by the global `mode` in testbench(); every entry reuses the same
// input vector with a different expected mask.
// NOTE(review): the order (gt, ge, eq, lt, le) presumably has to match the
// numeric values of enum CVM_MASK_TYPE - confirm against its declaration.
static struct pattern patterns[] = {
{cvm_mask_type_gt_0_input, cvm_mask_type_gt_0_output, input_sz},
{cvm_mask_type_gt_0_input, cvm_mask_type_ge_0_output, input_sz},
{cvm_mask_type_gt_0_input, cvm_mask_type_eq_0_output, input_sz},
{cvm_mask_type_gt_0_input, cvm_mask_type_lt_0_output, input_sz},
{cvm_mask_type_gt_0_input, cvm_mask_type_le_0_output, input_sz},
};
// Runs cvm_emit_mask() for the current global `mode` on the matching entry of
// `patterns` and compares the bf16 output bit-for-bit with the expected mask.
// On the first mismatch a diagnostic is printed to stderr and the process
// exits with -1.
static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) {
cvk_fmt_t fmt = CVK_FMT_BF16;
struct pattern *p = &patterns[mode];
// The pattern is laid out as a 1 x 1 x 1 x len tensor.
uint32_t input_n = 1;
uint32_t input_c = 1;
uint32_t input_h = 1;
uint32_t input_w = p->len;
cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w};
cvk_tl_shape_t ofmap_shape = ifmap_shape;
uint64_t ifmap_size = tl_shape_size(&ifmap_shape);
uint64_t ofmap_size = tl_shape_size(&ofmap_shape);
int data_type_size = bytesize_of_fmt(fmt);
uint64_t ifmap_bytesize = ifmap_size * data_type_size;
uint64_t ofmap_bytesize = ofmap_size * data_type_size;
// cvm_lut_tbl_bytesize() also fills in table_shape, which is used for the
// table allocations below.
cvk_tl_shape_t table_shape;
uint64_t table_bytesize = cvm_lut_tbl_bytesize(bmk, &table_shape, fmt);
// Local tensors; they are released at the end in exactly the reverse order.
cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1);
cvk_tl_t *out = tl_ofmap_bf16;
cvk_tl_t *tl_pos_neg_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_0_idx_table = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
// temp buf
cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_buf2 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1);
cvk_tl_t *tl_buf4 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1);
// Host-side staging buffers.
uint16_t *input_data = (uint16_t *)xmalloc(ifmap_bytesize);
uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize);
uint16_t *table_data_atan_pos_neg = (uint16_t *)xmalloc(table_bytesize);
uint16_t *idx_0_table_data = (uint16_t *)xmalloc(table_bytesize);
cvm_gen_0_tbl(idx_0_table_data, &table_shape);
cvm_pos_neg_tbl(table_data_atan_pos_neg, &table_shape);
// Convert the float pattern and its expected mask to bf16.
for (uint32_t i = 0; i < ifmap_size; i++) {
input_data[i] = convert_fp32_bf16(p->input[i]);
ref_data[i] = convert_fp32_bf16(p->ref[i]);
}
// Upload the input and both tables, emit the mask kernel and run it.
test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)input_data);
test_put_tensor_g2l_comp(ctx, bmk, tl_pos_neg_buf, (uint8_t *)table_data_atan_pos_neg);
test_put_tensor_g2l_comp(ctx, bmk, tl_0_idx_table, (uint8_t *)idx_0_table_data);
cvm_emit_mask(bmk, tl_ifmap, tl_buf, tl_buf2, tl_buf4, tl_pos_neg_buf, tl_0_idx_table, out, fmt,
mode);
test_submit_comp(ctx, bmk);
// Read the result back; the returned buffer is released with free() below.
uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, out);
for (uint32_t i = 0; i < ifmap_size; i++) {
if (ref_data[i] != ofmap_data[i]) {
fprintf(stderr,
"comparing failed at mode %d ofmap_data[%u] got %f(0x%x), ref "
"%f(0x%x)\n",
mode, i, convert_bf16_fp32(ofmap_data[i]), ofmap_data[i],
convert_bf16_fp32(ref_data[i]), ref_data[i]);
exit(-1);
}
}
#if 0
if (!is_close) {
float input = convert_bf16_fp32(ifmap[i]);
}
#endif
// Release local tensors in reverse allocation order.
free_tl(bmk, tl_buf4);
free_tl(bmk, tl_buf2);
free_tl(bmk, tl_buf);
free_tl(bmk, tl_0_idx_table);
free_tl(bmk, tl_pos_neg_buf);
free_tl(bmk, tl_ofmap_bf16);
free_tl(bmk, tl_ifmap);
free(input_data);
free(ref_data);
free(ofmap_data);
free(table_data_atan_pos_neg);
free(idx_0_table_data);
}
// Entry point of the mask test: runs testbench() once for every mask type
// from CVM_MASK_TYPE_GT_0 up to (but not including) CVM_MASK_MAX, publishing
// the type under test through the global `mode`.
int main() {
  CVI_RT_HANDLE ctx;
  cvk_context_t *bmk;
  const int round_mode = set_store_feround();
  test_init(&ctx, &bmk);

  int type_idx = CVM_MASK_TYPE_GT_0;
  while (type_idx < CVM_MASK_MAX) {
    mode = static_cast<enum CVM_MASK_TYPE>(type_idx);
    printf("test mode %d...\n", mode);
    testbench(&ctx, bmk);
    ++type_idx;
  }

  test_exit(&ctx, bmk);
  restore_feround(round_mode);
  return 0;
}

View File

@ -0,0 +1,376 @@
/**
*/
#include <cvimath_internal.h>
#include <test_cvikernel_util.h>
#include <cfloat>
#include <iomanip>
#include <iostream>
#include <map>
#include <random>
#include <string>
//#define DBG
using namespace std;
/**
* pre_data means we test a fixed pattern; it should be the same as the LUT
*/
// Test scenarios of this file; TEST_MODE_MAX is the number of real modes and
// serves as the end marker.
enum TEST_MODE {
PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare
GEN_POW_20_DATA_MAX_ERROR, // generate 2^-20 ~ 2^20 value that check epsilon
TEST_MODE_MAX,
};
static TEST_MODE mode;
static uint16_t test_pattern[] = {
0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52, 0x3A6C, 0x3A83, 0x3A90,
0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF, 0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17,
0x3B1D, 0x3B24, 0x3B2A, 0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65,
0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90, 0x3B93, 0x3B97, 0x3B9A,
0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE, 0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1,
0x3BC5, 0x3BC8, 0x3BCB, 0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9,
0x3BEC, 0x3BEF, 0x3BF2, 0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03, 0x3C05, 0x3C06, 0x3C08,
0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12, 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C,
0x3C1D, 0x3C1F, 0x3C21, 0x3C22, 0x3C24, 0x3C25, 0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F,
0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E, 0x3C40, 0x3C41, 0x3C43,
0x3C45, 0x3C46, 0x3C48, 0x3C4A, 0x3C4B, 0x3C4D, 0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57,
0x3C58, 0x3C5A, 0x3C5C, 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A,
0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, 0x3C7B, 0x3C7C, 0x3C7E,
0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89,
0x3C8A, 0x3C8A, 0x3C8B, 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93,
0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, 0x3C9B, 0x3C9C, 0x3C9C,
0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6,
0x3CA7, 0x3CA8, 0x3CA9, 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0,
0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, 0x3CB8, 0x3CB9, 0x3CBA,
0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4,
0x3CC5, 0x3CC5, 0x3CC6, 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE,
0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, 0x3CD6, 0x3CD7, 0x3CD7,
0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1,
0x3CE2, 0x3CE3, 0x3CE4, 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB,
0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, 0x3CF3, 0x3CF4, 0x3CF5,
0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF,
0x3D00, 0x3D00, 0x3D01, 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 0x3D03, 0x3D03, 0x3D04, 0x3D04,
0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, 0x3D08, 0x3D09, 0x3D09,
0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 0x3D0E,
0x3D0F, 0x3D0F, 0x3D0F, 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13,
0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 0x3D16, 0x3D17, 0x3D17, 0x3D18, 0x3D18,
0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D,
0x3D1D, 0x3D1E, 0x3D1E, 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22,
0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, 0x3D26, 0x3D26, 0x3D27,
0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C,
0x3D2C, 0x3D2C, 0x3D2D, 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31,
0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 0x3D33, 0x3D33, 0x3D34, 0x3D34, 0x3D35, 0x3D35, 0x3D35,
0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A,
0x3D3B, 0x3D3B, 0x3D3C, 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F,
0x3D40, 0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, 0x3D43, 0x3D44, 0x3D44,
0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49,
0x3D4A, 0x3D4A, 0x3D4A, 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E,
0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, 0x3D52, 0x3D53, 0x3D53,
0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58,
0x3D58, 0x3D59, 0x3D59, 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D,
0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, 0x3D61, 0x3D61, 0x3D62,
0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67,
0x3D67, 0x3D67, 0x3D68, 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C,
0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, 0x3D70, 0x3D70, 0x3D70,
0x3D71, 0x3D71, 0x3D72, 0x3D72, 0x3D72, 0x3D73, 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75,
0x3D76, 0x3D76, 0x3D77, 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A,
0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, 0x3D7E, 0x3D7F, 0x3D7F,
0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82,
0x3D82, 0x3D82, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85,
0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D87, 0x3D87, 0x3D87,
0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89,
0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8C, 0x3D8C,
0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E,
0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91,
0x3D91, 0x3D91, 0x3D91, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93,
0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, 0x3D95, 0x3D96, 0x3D96,
0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98,
0x3D98, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B,
0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9D, 0x3D9D, 0x3D9D,
0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0,
0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2,
0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4,
0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7,
0x3DA7, 0x3DA7, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9,
0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAC, 0x3DAC,
0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE,
0x3DAE, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1,
0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3,
0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6,
0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8,
0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBB,
0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD,
0x3DBD, 0x3DBD, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF,
0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC2, 0x3DC2, 0x3DC2,
0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4,
0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7,
0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, 0x3DC9, 0x3DC9, 0x3DC9,
0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC,
0x3DCC, 0x3DCC, 0x3DCC, 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5,
0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, 0x3DDF, 0x3DE0, 0x3DE1,
0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5,
};
// Golden output words for the fixed-pattern run.
// tl_lut_ref() copies entry i straight into the reference buffer when
// mode == PRE_DATA_COMPARE_FIX, and verify() then demands a bit-exact match
// against it. main() selects a 1x32x4x8 shape for that mode.
// NOTE(review): entry i is presumably the bf16 reciprocal expected for
// test_pattern[i]; how the table was generated is not shown in this file.
static uint16_t test_pattern_ref[] = {
0x7f7f, 0x461c, 0x459c, 0x4551, 0x451c, 0x44fa, 0x44d1, 0x44b2, 0x449c, 0x448b, 0x447a, 0x4464,
0x4451, 0x4441, 0x4432, 0x4426, 0x441c, 0x4413, 0x440b, 0x4404, 0x43fa, 0x43ed, 0x43e4, 0x43d9,
0x43d1, 0x43c8, 0x43c1, 0x43b9, 0x43b2, 0x43ac, 0x43a6, 0x43a1, 0x439c, 0x4398, 0x4393, 0x438f,
0x438b, 0x4387, 0x4384, 0x4380, 0x437a, 0x4375, 0x436d, 0x4368, 0x4364, 0x435f, 0x4359, 0x4355,
0x4351, 0x434c, 0x4348, 0x4344, 0x4341, 0x433c, 0x4339, 0x4336, 0x4332, 0x432f, 0x432c, 0x432a,
0x4326, 0x4324, 0x4321, 0x431f, 0x431c, 0x431a, 0x4318, 0x4315, 0x4313, 0x4311, 0x430f, 0x430d,
0x430b, 0x4309, 0x4307, 0x4305, 0x4304, 0x4302, 0x4300, 0x42fe, 0x42fa, 0x42f6, 0x42f5, 0x42f1,
0x42ed, 0x42ec, 0x42e8, 0x42e5, 0x42e4, 0x42e0, 0x42df, 0x42dc, 0x42d9, 0x42d8, 0x42d5, 0x42d2,
0x42d1, 0x42ce, 0x42cc, 0x42ca, 0x42c8, 0x42c7, 0x42c4, 0x42c2, 0x42c1, 0x42bf, 0x42bc, 0x42bb,
0x42b9, 0x42b7, 0x42b6, 0x42b4, 0x42b2, 0x42b1, 0x42af, 0x42ae, 0x42ac, 0x42ab, 0x42aa, 0x42a8,
0x42a6, 0x42a5, 0x42a4, 0x42a2, 0x42a1, 0x42a0, 0x429f, 0x429e, 0x429c, 0x429b, 0x429a, 0x4298,
0x4298, 0x4296, 0x4295, 0x4294, 0x4293, 0x4292, 0x4291, 0x4290, 0x428f, 0x428e, 0x428d, 0x428c,
0x428b, 0x428a, 0x4289, 0x4288, 0x4287, 0x4286, 0x4285, 0x4285, 0x4284, 0x4283, 0x4282, 0x4281,
0x4280, 0x427e, 0x427e, 0x427c, 0x427a, 0x4278, 0x4276, 0x4275, 0x4275, 0x4273, 0x4271, 0x426f,
0x426d, 0x426d, 0x426c, 0x426a, 0x4268, 0x4267, 0x4265, 0x4265, 0x4264, 0x4262, 0x4260, 0x425f,
0x425f, 0x425d, 0x425c, 0x425a, 0x4259, 0x4258, 0x4258, 0x4256, 0x4255, 0x4253, 0x4252, 0x4252,
0x4251, 0x424f, 0x424e, 0x424d, 0x424c, 0x424c, 0x424a, 0x4249, 0x4248, 0x4247, 0x4247, 0x4245,
0x4244, 0x4243, 0x4242, 0x4241, 0x4241, 0x4240, 0x423f, 0x423d, 0x423c, 0x423c, 0x423b, 0x423a,
0x4239, 0x4238, 0x4237, 0x4237, 0x4236, 0x4235, 0x4234, 0x4233, 0x4232, 0x4232, 0x4231, 0x4230,
0x422f, 0x422e, 0x422e, 0x422d, 0x422c, 0x422c, 0x422b, 0x422a, 0x422a, 0x4229, 0x4228, 0x4227,
0x4226, 0x4226, 0x4225, 0x4225, 0x4224, 0x4223, 0x4222, 0x4222, 0x4221, 0x4221, 0x4220, 0x421f,
0x421f, 0x421e, 0x421e, 0x421d, 0x421c, 0x421b, 0x421b, 0x421b, 0x421a, 0x4219, 0x4218, 0x4218,
0x4218, 0x4217, 0x4216, 0x4216, 0x4215, 0x4215, 0x4214, 0x4214, 0x4213, 0x4212, 0x4212, 0x4212,
0x4211, 0x4210, 0x4210, 0x420f, 0x420f, 0x420e, 0x420e, 0x420d, 0x420d, 0x420d, 0x420c, 0x420b,
0x420b, 0x420a, 0x420a, 0x420a, 0x4209, 0x4209, 0x4208, 0x4207, 0x4207, 0x4207, 0x4206, 0x4206,
0x4205, 0x4205, 0x4205, 0x4204, 0x4204, 0x4203, 0x4203, 0x4203, 0x4202, 0x4202, 0x4201, 0x4201,
0x4200, 0x4200, 0x41fe, 0x41fe, 0x41fe, 0x41fc, 0x41fc, 0x41fa, 0x41fa, 0x41fa, 0x41f8, 0x41f8,
0x41f6, 0x41f6, 0x41f5, 0x41f5, 0x41f5, 0x41f3, 0x41f3, 0x41f1, 0x41f1, 0x41f1, 0x41ef, 0x41ef,
0x41ed, 0x41ed, 0x41ed, 0x41ec, 0x41ec, 0x41ea, 0x41ea, 0x41ea, 0x41e8, 0x41e8, 0x41e7, 0x41e7,
0x41e5, 0x41e5, 0x41e5, 0x41e4, 0x41e4, 0x41e2, 0x41e2, 0x41e2, 0x41e0, 0x41e0, 0x41df, 0x41df,
0x41df, 0x41dd, 0x41dd, 0x41dc, 0x41dc, 0x41da, 0x41da, 0x41da, 0x41d9, 0x41d9, 0x41d8, 0x41d8,
0x41d8, 0x41d6, 0x41d6, 0x41d5, 0x41d5, 0x41d5, 0x41d3, 0x41d3, 0x41d2, 0x41d2, 0x41d2, 0x41d1,
0x41d1, 0x41cf, 0x41cf, 0x41ce, 0x41ce, 0x41ce, 0x41cd, 0x41cd, 0x41cc, 0x41cc, 0x41cc, 0x41ca,
0x41ca, 0x41c9, 0x41c9, 0x41c9, 0x41c8, 0x41c8, 0x41c7, 0x41c7, 0x41c7, 0x41c5, 0x41c5, 0x41c4,
0x41c4, 0x41c3, 0x41c3, 0x41c3, 0x41c2, 0x41c2, 0x41c1, 0x41c1, 0x41c1, 0x41c0, 0x41c0, 0x41bf,
0x41bf, 0x41bf, 0x41bd, 0x41bd, 0x41bc, 0x41bc, 0x41bc, 0x41bb, 0x41bb, 0x41ba, 0x41ba, 0x41b9,
0x41b9, 0x41b9, 0x41b8, 0x41b8, 0x41b7, 0x41b7, 0x41b7, 0x41b6, 0x41b6, 0x41b5, 0x41b5, 0x41b5,
0x41b4, 0x41b4, 0x41b3, 0x41b3, 0x41b2, 0x41b2, 0x41b2, 0x41b1, 0x41b1, 0x41b0, 0x41b0, 0x41b0,
0x41af, 0x41af, 0x41ae, 0x41ae, 0x41ae, 0x41ad, 0x41ad, 0x41ac, 0x41ac, 0x41ac, 0x41ac, 0x41ac,
0x41ab, 0x41ab, 0x41aa, 0x41aa, 0x41aa, 0x41a9, 0x41a9, 0x41a8, 0x41a8, 0x41a8, 0x41a7, 0x41a7,
0x41a6, 0x41a6, 0x41a6, 0x41a5, 0x41a5, 0x41a5, 0x41a5, 0x41a5, 0x41a4, 0x41a4, 0x41a3, 0x41a3,
0x41a2, 0x41a2, 0x41a2, 0x41a1, 0x41a1, 0x41a1, 0x41a1, 0x41a1, 0x41a0, 0x41a0, 0x419f, 0x419f,
0x419f, 0x419e, 0x419e, 0x419e, 0x419e, 0x419e, 0x419d, 0x419d, 0x419c, 0x419c, 0x419b, 0x419b,
0x419b, 0x419b, 0x419b, 0x419a, 0x419a, 0x419a, 0x4199, 0x4199, 0x4198, 0x4198, 0x4198, 0x4198,
0x4198, 0x4197, 0x4197, 0x4197, 0x4196, 0x4196, 0x4196, 0x4196, 0x4195, 0x4195, 0x4195, 0x4194,
0x4194, 0x4194, 0x4194, 0x4194, 0x4193, 0x4193, 0x4192, 0x4192, 0x4192, 0x4192, 0x4192, 0x4191,
0x4191, 0x4190, 0x4190, 0x4190, 0x4190, 0x4190, 0x418f, 0x418f, 0x418f, 0x418e, 0x418e, 0x418e,
0x418e, 0x418e, 0x418d, 0x418d, 0x418d, 0x418d, 0x418d, 0x418c, 0x418c, 0x418b, 0x418b, 0x418b,
0x418b, 0x418b, 0x418a, 0x418a, 0x418a, 0x418a, 0x418a, 0x4189, 0x4189, 0x4189, 0x4189, 0x4189,
0x4188, 0x4188, 0x4187, 0x4187, 0x4187, 0x4187, 0x4187, 0x4186, 0x4186, 0x4186, 0x4186, 0x4186,
0x4185, 0x4185, 0x4185, 0x4185, 0x4185, 0x4184, 0x4184, 0x4184, 0x4184, 0x4184, 0x4183, 0x4183,
0x4183, 0x4183, 0x4183, 0x4182, 0x4182, 0x4182, 0x4182, 0x4181, 0x4181, 0x4181, 0x4181, 0x4181,
0x4180, 0x4180, 0x4180, 0x4180, 0x417e, 0x417e, 0x417e, 0x417e, 0x417e, 0x417c, 0x417c, 0x417c,
0x417c, 0x417c, 0x417a, 0x417a, 0x417a, 0x417a, 0x417a, 0x4178, 0x4178, 0x4178, 0x4178, 0x4176,
0x4176, 0x4176, 0x4176, 0x4176, 0x4175, 0x4175, 0x4175, 0x4175, 0x4175, 0x4173, 0x4173, 0x4173,
0x4173, 0x4173, 0x4171, 0x4171, 0x4171, 0x4171, 0x4171, 0x416f, 0x416f, 0x416f, 0x416f, 0x416f,
0x416d, 0x416d, 0x416d, 0x416d, 0x416d, 0x416c, 0x416c, 0x416c, 0x416c, 0x416c, 0x416a, 0x416a,
0x416a, 0x416a, 0x416a, 0x4168, 0x4168, 0x4168, 0x4168, 0x4167, 0x4167, 0x4167, 0x4167, 0x4167,
0x4165, 0x4165, 0x4165, 0x4165, 0x4165, 0x4164, 0x4164, 0x4164, 0x4164, 0x4164, 0x4162, 0x4162,
0x4162, 0x4162, 0x4162, 0x4160, 0x4160, 0x4160, 0x4160, 0x4160, 0x415f, 0x415f, 0x415f, 0x415f,
0x415f, 0x415d, 0x415d, 0x415d, 0x415d, 0x415d, 0x415c, 0x415c, 0x415c, 0x415c, 0x415a, 0x415a,
0x415a, 0x415a, 0x415a, 0x4159, 0x4159, 0x4159, 0x4159, 0x4159, 0x4158, 0x4158, 0x4158, 0x4158,
0x4158, 0x4156, 0x4156, 0x4156, 0x4156, 0x4156, 0x4155, 0x4155, 0x4155, 0x4155, 0x4155, 0x4153,
0x4153, 0x4153, 0x4153, 0x4153, 0x4152, 0x4152, 0x4152, 0x4152, 0x4152, 0x4151, 0x4151, 0x4151,
0x4151, 0x4151, 0x414f, 0x414f, 0x414f, 0x414f, 0x414e, 0x414e, 0x414e, 0x414e, 0x414e, 0x414d,
0x414d, 0x414d, 0x414d, 0x414d, 0x414c, 0x414c, 0x414c, 0x414c, 0x414c, 0x414a, 0x414a, 0x414a,
0x414a, 0x414a, 0x4149, 0x4149, 0x4149, 0x4149, 0x4149, 0x4148, 0x4148, 0x4148, 0x4148, 0x4148,
0x4147, 0x4147, 0x4147, 0x4147, 0x4147, 0x4145, 0x4145, 0x4145, 0x4145, 0x4144, 0x4144, 0x4144,
0x4144, 0x4144, 0x4143, 0x4143, 0x4143, 0x4143, 0x4143, 0x4142, 0x4142, 0x4142, 0x4142, 0x4142,
0x4141, 0x4141, 0x4141, 0x4141, 0x4141, 0x4140, 0x4140, 0x4140, 0x4140, 0x4140, 0x413f, 0x413f,
0x413f, 0x413f, 0x413f, 0x413d, 0x413d, 0x413d, 0x413d, 0x413d, 0x413c, 0x413c, 0x413c, 0x413c,
0x413c, 0x413b, 0x413b, 0x413b, 0x413b, 0x413a, 0x413a, 0x413a, 0x413a, 0x413a, 0x4139, 0x4139,
0x4139, 0x4139, 0x4139, 0x4138, 0x4138, 0x4138, 0x4138, 0x4138, 0x4137, 0x4137, 0x4137, 0x4137,
0x4137, 0x4136, 0x4136, 0x4136, 0x4136, 0x4136, 0x4135, 0x4135, 0x4135, 0x4135, 0x4135, 0x4134,
0x4134, 0x4134, 0x4134, 0x4134, 0x4133, 0x4133, 0x4133, 0x4133, 0x4132, 0x4132, 0x4132, 0x4132,
0x4132, 0x4131, 0x4131, 0x4131, 0x4131, 0x4131, 0x4130, 0x4130, 0x4130, 0x4130, 0x4130, 0x412f,
0x412f, 0x412f, 0x412f, 0x412f, 0x412e, 0x412e, 0x412e, 0x412e, 0x412e, 0x412d, 0x412d, 0x412d,
0x412d, 0x412d, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c,
0x412b, 0x412b, 0x412b, 0x412b, 0x412a, 0x412a, 0x412a, 0x412a, 0x412a, 0x4129, 0x4129, 0x4129,
0x4129, 0x4129, 0x4128, 0x4128, 0x4128, 0x4128, 0x4128, 0x4127, 0x4127, 0x4127, 0x4127, 0x4127,
0x4126, 0x4126, 0x4126, 0x4126, 0x4126, 0x4125, 0x4125, 0x4125, 0x4125, 0x4125, 0x4125, 0x4125,
0x4125, 0x4125, 0x4125, 0x4124, 0x4124, 0x4124, 0x4124, 0x4124, 0x4123, 0x4123, 0x4123, 0x4123,
0x4122, 0x4122, 0x4122, 0x4122, 0x4122, 0x4121, 0x4121, 0x4121, 0x4121, 0x4121, 0x4121, 0x4121,
0x4121, 0x4121, 0x4121, 0x4120, 0x411f, 0x411e, 0x411e, 0x411d, 0x411c, 0x411b, 0x411b, 0x411a,
0x4119, 0x4118, 0x4118, 0x4117, 0x4116, 0x4116, 0x4115, 0x4114, 0x4114, 0x4113, 0x4112, 0x4112,
0x4111, 0x4110, 0x4110, 0x410f,
};
/**
 * Builds the host-side reference output for the reciprocal test.
 *
 * Which reference is produced depends on the file-scope `mode`:
 *  - PRE_DATA_COMPARE_FIX: entry i is taken verbatim from test_pattern_ref.
 *  - any other mode: entry i is 1/x, where x is ifmap[i] widened with
 *    convert_bf16_fp32(); the quotient is narrowed back with
 *    convert_fp32_bf16().
 *
 * @param ofmap        destination, one word per element of `ifmap_shape`
 * @param ifmap        input words (read only in the computed branch)
 * @param ifmap_shape  shape whose element count bounds the loop
 */
static void tl_lut_ref(uint16_t *ofmap, uint16_t *ifmap, cvk_tl_shape_t ifmap_shape) {
  // `mode` does not change while the reference is being built.
  const bool use_golden_table = (mode == PRE_DATA_COMPARE_FIX);

  for (uint32_t idx = 0; idx < tl_shape_size(&ifmap_shape); ++idx) {
    if (use_golden_table) {
      ofmap[idx] = test_pattern_ref[idx];
      continue;
    }

    // The division is carried out in double precision before narrowing.
    const double in_val = convert_bf16_fp32(ifmap[idx]);
    ofmap[idx] = convert_fp32_bf16(1.0 / in_val);
  }
}
/**
 * Checks the device output against the host reference, element by element.
 *
 * With mode == PRE_DATA_COMPARE_FIX the raw 16-bit words must be identical.
 * In every other mode both words are widened with convert_bf16_fp32() and
 * must differ by less than 0.001 in absolute value.
 *
 * The first mismatch is reported on stderr and terminates the process with
 * exit(-1), so the function only ever returns true.
 *
 * @param ofmap_data        words produced by the device
 * @param ref_data          expected words
 * @param ifmap             input words; only used in the failure message
 * @param ifmap_shape_size  number of elements to compare
 * @param mode              comparison policy (shadows the file-scope `mode`)
 * @return true when every element passes
 */
static bool verify(uint16_t *ofmap_data, uint16_t *ref_data, uint16_t *ifmap,
                   uint64_t ifmap_shape_size, TEST_MODE mode) {
  for (uint64_t i = 0; i < ifmap_shape_size; i++) {
    const uint16_t ref = ref_data[i];
    const uint16_t got = ofmap_data[i];
    const float ref_f = convert_bf16_fp32(ref);
    const float got_f = convert_bf16_fp32(got);

    bool is_close;
    if (mode == PRE_DATA_COMPARE_FIX) {
      is_close = got == ref;
    } else {
      is_close = fabs(ref_f - got_f) < 0.001;
    }

    if (!is_close) {
      // uint64_t is not `unsigned long` on every target (32-bit builds), so
      // the index is widened explicitly and printed with %llu.
      fprintf(stderr,
              "comparing failed at ofmap_data[%llu](input:%e), got %x, exp %x, "
              "fp32: got %e exp %e\n",
              static_cast<unsigned long long>(i), convert_bf16_fp32(ifmap[i]), got, ref, got_f,
              ref_f);
      exit(-1);
    }
  }
  return true;
}
/**
 * Fills `ifmap` with the input words for the current file-scope `mode`.
 *
 *  - PRE_DATA_COMPARE_FIX: copies the fixed `test_pattern` table, limited to
 *    the size of the destination buffer.
 *  - any other mode: draws rand()-based values in [2^-10, 2^10] and stores
 *    each one through convert_fp32_bf16().
 *
 * When DBG is defined, every generated element is printed afterwards.
 *
 * @param ifmap             destination buffer of `ifmap_shape_size` words
 * @param ifmap_shape_size  number of elements to produce
 */
static void gen_input(uint16_t *ifmap, uint64_t ifmap_shape_size) {
  if (mode == PRE_DATA_COMPARE_FIX) {
    // Never copy more than the caller's buffer can hold.
    const uint64_t dst_bytes = ifmap_shape_size * sizeof(uint16_t);
    const uint64_t copy_bytes =
        sizeof(test_pattern) < dst_bytes ? sizeof(test_pattern) : dst_bytes;
    memcpy(ifmap, &test_pattern, static_cast<size_t>(copy_bytes));
  } else {
    // Seed once per call; a single pass over the buffer is all that is needed.
    srand(static_cast<unsigned>(time(0)));
    const float LO = pow(2, -10);
    const float HI = pow(2, 10);
    for (uint64_t i = 0; i < ifmap_shape_size; i++) {
      // Map rand() from [0, RAND_MAX] onto [LO, HI].
      float r3 = LO + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX / (HI - LO)));
      ifmap[i] = convert_fp32_bf16(r3);
    }
  }
#ifdef DBG
  for (uint64_t i = 0; i < ifmap_shape_size; i++) {
    // %llu plus an explicit cast keeps the index portable across 32/64-bit targets.
    printf("source if[%llu] bf16 %f 0x%x, log2f is %f\n", static_cast<unsigned long long>(i),
           convert_bf16_fp32(ifmap[i]), ifmap[i], floor(log2((convert_bf16_fp32(ifmap[i])))));
  }
#endif /* ifdef DBG */
}
/**
 * Runs one reciprocal test case for an (n, c, h, w) input of CVK_FMT_BF16
 * elements and checks the device result against the host reference.
 *
 * Sequence: allocate host buffers and local-memory tensors, generate the
 * input and its reference, build the two lookup tables, upload everything,
 * emit and submit the reciprocal command, read the output back, verify it,
 * then release all resources. verify() terminates the process on a mismatch.
 *
 * @param ctx      runtime handle passed through to the test_* helpers
 * @param bmk      kernel context passed through to the cvm_* / test_* helpers
 * @param input_n  input shape, n
 * @param input_c  input shape, c
 * @param input_h  input shape, h
 * @param input_w  input shape, w
 */
static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk, uint32_t input_n, uint32_t input_c,
                      uint32_t input_h, uint32_t input_w) {
  cvk_fmt_t elem_fmt = CVK_FMT_BF16;
  // TODO: check more shape / align
  cvk_tl_shape_t in_shape = {input_n, input_c, input_h, input_w};
  cvk_tl_shape_t out_shape = in_shape;
  cvk_tl_shape_t lut_shape;
  cvm_table_shape(bmk, &lut_shape);

  // Element counts first, then the byte sizes of the host-side buffers.
  uint64_t in_count = tl_shape_size(&in_shape);
  uint64_t out_count = tl_shape_size(&out_shape);
  uint64_t lut_count = tl_shape_size(&lut_shape);
  int elem_bytes = bytesize_of_fmt(elem_fmt);
  uint64_t in_bytes = in_count * elem_bytes;
  uint64_t out_bytes = out_count * elem_bytes;
  uint64_t lut_bytes = lut_count * elem_bytes;

  // Host buffers: input, reference output and the two tables.
  uint16_t *host_in = (uint16_t *)xmalloc(in_bytes);
  uint16_t *host_ref = (uint16_t *)xmalloc(out_bytes);
  uint16_t *host_lut = (uint16_t *)xmalloc(lut_bytes);
  uint16_t *host_lut_mantissa = (uint16_t *)xmalloc(lut_bytes);

  // Local-memory tensors; they are released further down in the exact
  // reverse of this order.
  cvk_tl_t *tl_in = test_alloc_tl(bmk, in_shape, elem_fmt, /*align*/ 1);
  cvk_tl_t *tl_out = test_alloc_tl(bmk, out_shape, elem_fmt, /*align*/ 1);
  cvk_tl_t *tl_scratch = test_alloc_tl(bmk, tl_in->shape, elem_fmt, /*align*/ 1);
  cvk_tl_t *tl_lut = test_alloc_tl(bmk, lut_shape, elem_fmt, /*align*/ 1);
  cvk_tl_t *tl_lut_mantissa = test_alloc_tl(bmk, lut_shape, elem_fmt, /*align*/ 1);

  // Input data and the matching host reference.
  gen_input(host_in, in_count);
  tl_lut_ref(host_ref, host_in, in_shape);

  // Fill both lookup tables on the host.
  cvm_reciprocal_tbl(host_lut, host_lut_mantissa, &lut_shape);

  // System memory -> local memory.
  test_put_tensor_g2l_comp(ctx, bmk, tl_in, (uint8_t *)host_in);
  test_put_tensor_g2l_comp(ctx, bmk, tl_lut, (uint8_t *)host_lut);
  test_put_tensor_g2l_comp(ctx, bmk, tl_lut_mantissa, (uint8_t *)host_lut_mantissa);

  cvm_emit_reciprocal(bmk, tl_in, tl_scratch, tl_lut, tl_lut_mantissa, tl_out);

  // Issue the queued commands.
  test_submit_comp(ctx, bmk);

  // Local memory -> system memory; the returned buffer is freed below.
  uint16_t *host_out = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, tl_out);
  verify(host_out, host_ref, host_in, in_count, mode);

  free_tl(bmk, tl_lut_mantissa);
  free_tl(bmk, tl_lut);
  free_tl(bmk, tl_scratch);
  free_tl(bmk, tl_out);
  free_tl(bmk, tl_in);

  free(host_in);
  free(host_ref);
  free(host_out);
  free(host_lut);
  free(host_lut_mantissa);
}
/**
 * Test entry point: runs testbench() once per test mode.
 *
 * Iteration starts at GEN_POW_20_DATA_MAX_ERROR and stops before
 * TEST_MODE_MAX, so a mode with a smaller enumerator value is not run.
 * The value returned by set_store_feround() is handed back to
 * restore_feround() before exit.
 *
 * @return 0 (a failed comparison exits the process inside verify())
 */
int main() {
  CVI_RT_HANDLE ctx;
  cvk_context_t *bmk;

  const int saved_round_mode = set_store_feround();
  test_init(&ctx, &bmk);

  for (int m = GEN_POW_20_DATA_MAX_ERROR; m < TEST_MODE_MAX; ++m) {
    mode = static_cast<TEST_MODE>(m);
    printf("test mode %d...\n", mode);

    // The fixed-pattern mode uses a 1x32x4x8 tensor, every other mode 1x32x16x16.
    const bool fixed_pattern = (mode == PRE_DATA_COMPARE_FIX);
    const int input_n = 1;
    const int input_c = 32;
    const int input_h = fixed_pattern ? 4 : 16;
    const int input_w = fixed_pattern ? 8 : 16;

    testbench(&ctx, bmk, input_n, input_c, input_h, input_w);
  }

  test_exit(&ctx, bmk);
  restore_feround(saved_round_mode);
  return 0;
}

View File

@ -0,0 +1,907 @@
//* TODO: you could rerange any value to -127~127
#include <cvimath_internal.h>
#include <test_cvikernel_util.h>
#define OUT
#define IN
//#define DBG
/**
 * Test modes.
 *
 * "pre data" means a fixed input pattern is tested; its result should be the
 * same as the LUT. "compare fix" means we MAKE SURE the output values equal
 * the golden values; the remaining modes only check that the difference stays
 * below `MAX_ERROR`.
 */
enum TEST_MODE {
  // fixed input pattern + exact comparison with the golden values
  PRE_DATA_COMPARE_FIX = 0,
  // fixed input pattern + pass when diff < MAX_ERROR
  PRE_DATA_MAX_ERROR,
  // generated input + pass when diff < MAX_ERROR
  GEN_DATA_MAX_ERROR,
  // one past the last mode (equals the number of modes)
  TEST_MODE_MAX,
};
// Currently selected test mode; file-scope with internal linkage.
static TEST_MODE mode;
// Tolerance for the *_MAX_ERROR modes: the output must differ from the
// reference by less than this.
// NOTE(review): presumably an absolute difference - confirm at the comparison site.
#define MAX_ERROR (0.004)
// Makes std:: names usable unqualified throughout this test file.
using namespace std;
// Fixed input pattern, stored as raw 16-bit words.
// Layout as it appears in the data: a leading 0x0000, a run from 0x3C03 up
// to 0x4100, then a run from 0xBC03 to 0xC100 that appears to repeat the
// same values with bit 15 set.
// NOTE(review): presumably bf16 encodings (positive half, then negative
// half) fed to the kernel in the "pre data" test modes - confirm at the
// point of use.
static uint16_t test_pattern[] = {
0x0000, 0x3C03, 0x3C83, 0x3CC5, 0x3D03, 0x3D24, 0x3D45, 0x3D65, 0x3D83, 0x3D93, 0x3DA4, 0x3DB4,
0x3DC5, 0x3DD5, 0x3DE5, 0x3DF6, 0x3E03, 0x3E0B, 0x3E13, 0x3E1C, 0x3E24, 0x3E2C, 0x3E34, 0x3E3C,
0x3E45, 0x3E4D, 0x3E55, 0x3E5D, 0x3E65, 0x3E6E, 0x3E76, 0x3E7E, 0x3E83, 0x3E87, 0x3E8B, 0x3E8F,
0x3E93, 0x3E98, 0x3E9C, 0x3EA0, 0x3EA4, 0x3EA8, 0x3EAC, 0x3EB0, 0x3EB4, 0x3EB8, 0x3EBC, 0x3EC1,
0x3EC5, 0x3EC9, 0x3ECD, 0x3ED1, 0x3ED5, 0x3ED9, 0x3EDD, 0x3EE1, 0x3EE5, 0x3EE9, 0x3EEE, 0x3EF2,
0x3EF6, 0x3EFA, 0x3EFE, 0x3F01, 0x3F03, 0x3F05, 0x3F07, 0x3F09, 0x3F0B, 0x3F0D, 0x3F0F, 0x3F11,
0x3F13, 0x3F16, 0x3F18, 0x3F1A, 0x3F1C, 0x3F1E, 0x3F20, 0x3F22, 0x3F24, 0x3F26, 0x3F28, 0x3F2A,
0x3F2C, 0x3F2E, 0x3F30, 0x3F32, 0x3F34, 0x3F36, 0x3F38, 0x3F3A, 0x3F3C, 0x3F3E, 0x3F41, 0x3F43,
0x3F45, 0x3F47, 0x3F49, 0x3F4B, 0x3F4D, 0x3F4F, 0x3F51, 0x3F53, 0x3F55, 0x3F57, 0x3F59, 0x3F5B,
0x3F5D, 0x3F5F, 0x3F61, 0x3F63, 0x3F65, 0x3F67, 0x3F69, 0x3F6C, 0x3F6E, 0x3F70, 0x3F72, 0x3F74,
0x3F76, 0x3F78, 0x3F7A, 0x3F7C, 0x3F7E, 0x3F80, 0x3F81, 0x3F82, 0x3F83, 0x3F84, 0x3F85, 0x3F86,
0x3F87, 0x3F88, 0x3F89, 0x3F8A, 0x3F8B, 0x3F8C, 0x3F8D, 0x3F8E, 0x3F8F, 0x3F90, 0x3F91, 0x3F92,
0x3F93, 0x3F94, 0x3F96, 0x3F97, 0x3F98, 0x3F99, 0x3F9A, 0x3F9B, 0x3F9C, 0x3F9D, 0x3F9E, 0x3F9F,
0x3FA0, 0x3FA1, 0x3FA2, 0x3FA3, 0x3FA4, 0x3FA5, 0x3FA6, 0x3FA7, 0x3FA8, 0x3FA9, 0x3FAA, 0x3FAB,
0x3FAC, 0x3FAD, 0x3FAE, 0x3FAF, 0x3FB0, 0x3FB1, 0x3FB2, 0x3FB3, 0x3FB4, 0x3FB5, 0x3FB6, 0x3FB7,
0x3FB8, 0x3FB9, 0x3FBA, 0x3FBB, 0x3FBC, 0x3FBD, 0x3FBE, 0x3FBF, 0x3FC1, 0x3FC2, 0x3FC3, 0x3FC4,
0x3FC5, 0x3FC6, 0x3FC7, 0x3FC8, 0x3FC9, 0x3FCA, 0x3FCB, 0x3FCC, 0x3FCD, 0x3FCE, 0x3FCF, 0x3FD0,
0x3FD1, 0x3FD2, 0x3FD3, 0x3FD4, 0x3FD5, 0x3FD6, 0x3FD7, 0x3FD8, 0x3FD9, 0x3FDA, 0x3FDB, 0x3FDC,
0x3FDD, 0x3FDE, 0x3FDF, 0x3FE0, 0x3FE1, 0x3FE2, 0x3FE3, 0x3FE4, 0x3FE5, 0x3FE6, 0x3FE7, 0x3FE8,
0x3FE9, 0x3FEA, 0x3FEC, 0x3FED, 0x3FEE, 0x3FEF, 0x3FF0, 0x3FF1, 0x3FF2, 0x3FF3, 0x3FF4, 0x3FF5,
0x3FF6, 0x3FF7, 0x3FF8, 0x3FF9, 0x3FFA, 0x3FFB, 0x3FFC, 0x3FFD, 0x3FFE, 0x3FFF, 0x4000, 0x4001,
0x4001, 0x4002, 0x4002, 0x4003, 0x4003, 0x4004, 0x4004, 0x4005, 0x4005, 0x4006, 0x4006, 0x4007,
0x4007, 0x4008, 0x4008, 0x4009, 0x4009, 0x400A, 0x400A, 0x400B, 0x400B, 0x400C, 0x400C, 0x400D,
0x400D, 0x400E, 0x400E, 0x400F, 0x400F, 0x4010, 0x4010, 0x4011, 0x4011, 0x4012, 0x4012, 0x4013,
0x4013, 0x4014, 0x4014, 0x4015, 0x4016, 0x4016, 0x4017, 0x4017, 0x4018, 0x4018, 0x4019, 0x4019,
0x401A, 0x401A, 0x401B, 0x401B, 0x401C, 0x401C, 0x401D, 0x401D, 0x401E, 0x401E, 0x401F, 0x401F,
0x4020, 0x4020, 0x4021, 0x4021, 0x4022, 0x4022, 0x4023, 0x4023, 0x4024, 0x4024, 0x4025, 0x4025,
0x4026, 0x4026, 0x4027, 0x4027, 0x4028, 0x4028, 0x4029, 0x4029, 0x402A, 0x402A, 0x402B, 0x402C,
0x402C, 0x402D, 0x402D, 0x402E, 0x402E, 0x402F, 0x402F, 0x4030, 0x4030, 0x4031, 0x4031, 0x4032,
0x4032, 0x4033, 0x4033, 0x4034, 0x4034, 0x4035, 0x4035, 0x4036, 0x4036, 0x4037, 0x4037, 0x4038,
0x4038, 0x4039, 0x4039, 0x403A, 0x403A, 0x403B, 0x403B, 0x403C, 0x403C, 0x403D, 0x403D, 0x403E,
0x403E, 0x403F, 0x403F, 0x4040, 0x4041, 0x4041, 0x4042, 0x4042, 0x4043, 0x4043, 0x4044, 0x4044,
0x4045, 0x4045, 0x4046, 0x4046, 0x4047, 0x4047, 0x4048, 0x4048, 0x4049, 0x4049, 0x404A, 0x404A,
0x404B, 0x404B, 0x404C, 0x404C, 0x404D, 0x404D, 0x404E, 0x404E, 0x404F, 0x404F, 0x4050, 0x4050,
0x4051, 0x4051, 0x4052, 0x4052, 0x4053, 0x4053, 0x4054, 0x4054, 0x4055, 0x4056, 0x4056, 0x4057,
0x4057, 0x4058, 0x4058, 0x4059, 0x4059, 0x405A, 0x405A, 0x405B, 0x405B, 0x405C, 0x405C, 0x405D,
0x405D, 0x405E, 0x405E, 0x405F, 0x405F, 0x4060, 0x4060, 0x4061, 0x4061, 0x4062, 0x4062, 0x4063,
0x4063, 0x4064, 0x4064, 0x4065, 0x4065, 0x4066, 0x4066, 0x4067, 0x4067, 0x4068, 0x4068, 0x4069,
0x4069, 0x406A, 0x406A, 0x406B, 0x406C, 0x406C, 0x406D, 0x406D, 0x406E, 0x406E, 0x406F, 0x406F,
0x4070, 0x4070, 0x4071, 0x4071, 0x4072, 0x4072, 0x4073, 0x4073, 0x4074, 0x4074, 0x4075, 0x4075,
0x4076, 0x4076, 0x4077, 0x4077, 0x4078, 0x4078, 0x4079, 0x4079, 0x407A, 0x407A, 0x407B, 0x407B,
0x407C, 0x407C, 0x407D, 0x407D, 0x407E, 0x407E, 0x407F, 0x407F, 0x4080, 0x4080, 0x4081, 0x4081,
0x4081, 0x4081, 0x4082, 0x4082, 0x4082, 0x4082, 0x4083, 0x4083, 0x4083, 0x4083, 0x4084, 0x4084,
0x4084, 0x4084, 0x4085, 0x4085, 0x4085, 0x4085, 0x4086, 0x4086, 0x4086, 0x4086, 0x4087, 0x4087,
0x4087, 0x4087, 0x4088, 0x4088, 0x4088, 0x4088, 0x4089, 0x4089, 0x4089, 0x4089, 0x408A, 0x408A,
0x408A, 0x408A, 0x408B, 0x408B, 0x408B, 0x408C, 0x408C, 0x408C, 0x408C, 0x408D, 0x408D, 0x408D,
0x408D, 0x408E, 0x408E, 0x408E, 0x408E, 0x408F, 0x408F, 0x408F, 0x408F, 0x4090, 0x4090, 0x4090,
0x4090, 0x4091, 0x4091, 0x4091, 0x4091, 0x4092, 0x4092, 0x4092, 0x4092, 0x4093, 0x4093, 0x4093,
0x4093, 0x4094, 0x4094, 0x4094, 0x4094, 0x4095, 0x4095, 0x4095, 0x4096, 0x4096, 0x4096, 0x4096,
0x4097, 0x4097, 0x4097, 0x4097, 0x4098, 0x4098, 0x4098, 0x4098, 0x4099, 0x4099, 0x4099, 0x4099,
0x409A, 0x409A, 0x409A, 0x409A, 0x409B, 0x409B, 0x409B, 0x409B, 0x409C, 0x409C, 0x409C, 0x409C,
0x409D, 0x409D, 0x409D, 0x409D, 0x409E, 0x409E, 0x409E, 0x409E, 0x409F, 0x409F, 0x409F, 0x409F,
0x40A0, 0x40A0, 0x40A0, 0x40A1, 0x40A1, 0x40A1, 0x40A1, 0x40A2, 0x40A2, 0x40A2, 0x40A2, 0x40A3,
0x40A3, 0x40A3, 0x40A3, 0x40A4, 0x40A4, 0x40A4, 0x40A4, 0x40A5, 0x40A5, 0x40A5, 0x40A5, 0x40A6,
0x40A6, 0x40A6, 0x40A6, 0x40A7, 0x40A7, 0x40A7, 0x40A7, 0x40A8, 0x40A8, 0x40A8, 0x40A8, 0x40A9,
0x40A9, 0x40A9, 0x40A9, 0x40AA, 0x40AA, 0x40AA, 0x40AA, 0x40AB, 0x40AB, 0x40AB, 0x40AC, 0x40AC,
0x40AC, 0x40AC, 0x40AD, 0x40AD, 0x40AD, 0x40AD, 0x40AE, 0x40AE, 0x40AE, 0x40AE, 0x40AF, 0x40AF,
0x40AF, 0x40AF, 0x40B0, 0x40B0, 0x40B0, 0x40B0, 0x40B1, 0x40B1, 0x40B1, 0x40B1, 0x40B2, 0x40B2,
0x40B2, 0x40B2, 0x40B3, 0x40B3, 0x40B3, 0x40B3, 0x40B4, 0x40B4, 0x40B4, 0x40B4, 0x40B5, 0x40B5,
0x40B5, 0x40B6, 0x40B6, 0x40B6, 0x40B6, 0x40B7, 0x40B7, 0x40B7, 0x40B7, 0x40B8, 0x40B8, 0x40B8,
0x40B8, 0x40B9, 0x40B9, 0x40B9, 0x40B9, 0x40BA, 0x40BA, 0x40BA, 0x40BA, 0x40BB, 0x40BB, 0x40BB,
0x40BB, 0x40BC, 0x40BC, 0x40BC, 0x40BC, 0x40BD, 0x40BD, 0x40BD, 0x40BD, 0x40BE, 0x40BE, 0x40BE,
0x40BE, 0x40BF, 0x40BF, 0x40BF, 0x40BF, 0x40C0, 0x40C0, 0x40C0, 0x40C1, 0x40C1, 0x40C1, 0x40C1,
0x40C2, 0x40C2, 0x40C2, 0x40C2, 0x40C3, 0x40C3, 0x40C3, 0x40C3, 0x40C4, 0x40C4, 0x40C4, 0x40C4,
0x40C5, 0x40C5, 0x40C5, 0x40C5, 0x40C6, 0x40C6, 0x40C6, 0x40C6, 0x40C7, 0x40C7, 0x40C7, 0x40C7,
0x40C8, 0x40C8, 0x40C8, 0x40C8, 0x40C9, 0x40C9, 0x40C9, 0x40C9, 0x40CA, 0x40CA, 0x40CA, 0x40CA,
0x40CB, 0x40CB, 0x40CB, 0x40CC, 0x40CC, 0x40CC, 0x40CC, 0x40CD, 0x40CD, 0x40CD, 0x40CD, 0x40CE,
0x40CE, 0x40CE, 0x40CE, 0x40CF, 0x40CF, 0x40CF, 0x40CF, 0x40D0, 0x40D0, 0x40D0, 0x40D0, 0x40D1,
0x40D1, 0x40D1, 0x40D1, 0x40D2, 0x40D2, 0x40D2, 0x40D2, 0x40D3, 0x40D3, 0x40D3, 0x40D3, 0x40D4,
0x40D4, 0x40D4, 0x40D4, 0x40D5, 0x40D5, 0x40D5, 0x40D6, 0x40D6, 0x40D6, 0x40D6, 0x40D7, 0x40D7,
0x40D7, 0x40D7, 0x40D8, 0x40D8, 0x40D8, 0x40D8, 0x40D9, 0x40D9, 0x40D9, 0x40D9, 0x40DA, 0x40DA,
0x40DA, 0x40DA, 0x40DB, 0x40DB, 0x40DB, 0x40DB, 0x40DC, 0x40DC, 0x40DC, 0x40DC, 0x40DD, 0x40DD,
0x40DD, 0x40DD, 0x40DE, 0x40DE, 0x40DE, 0x40DE, 0x40DF, 0x40DF, 0x40DF, 0x40DF, 0x40E0, 0x40E0,
0x40E0, 0x40E1, 0x40E1, 0x40E1, 0x40E1, 0x40E2, 0x40E2, 0x40E2, 0x40E2, 0x40E3, 0x40E3, 0x40E3,
0x40E3, 0x40E4, 0x40E4, 0x40E4, 0x40E4, 0x40E5, 0x40E5, 0x40E5, 0x40E5, 0x40E6, 0x40E6, 0x40E6,
0x40E6, 0x40E7, 0x40E7, 0x40E7, 0x40E7, 0x40E8, 0x40E8, 0x40E8, 0x40E8, 0x40E9, 0x40E9, 0x40E9,
0x40E9, 0x40EA, 0x40EA, 0x40EA, 0x40EA, 0x40EB, 0x40EB, 0x40EB, 0x40EC, 0x40EC, 0x40EC, 0x40EC,
0x40ED, 0x40ED, 0x40ED, 0x40ED, 0x40EE, 0x40EE, 0x40EE, 0x40EE, 0x40EF, 0x40EF, 0x40EF, 0x40EF,
0x40F0, 0x40F0, 0x40F0, 0x40F0, 0x40F1, 0x40F1, 0x40F1, 0x40F1, 0x40F2, 0x40F2, 0x40F2, 0x40F2,
0x40F3, 0x40F3, 0x40F3, 0x40F3, 0x40F4, 0x40F4, 0x40F4, 0x40F4, 0x40F5, 0x40F5, 0x40F5, 0x40F6,
0x40F6, 0x40F6, 0x40F6, 0x40F7, 0x40F7, 0x40F7, 0x40F7, 0x40F8, 0x40F8, 0x40F8, 0x40F8, 0x40F9,
0x40F9, 0x40F9, 0x40F9, 0x40FA, 0x40FA, 0x40FA, 0x40FA, 0x40FB, 0x40FB, 0x40FB, 0x40FB, 0x40FC,
0x40FC, 0x40FC, 0x40FC, 0x40FD, 0x40FD, 0x40FD, 0x40FD, 0x40FE, 0x40FE, 0x40FE, 0x40FE, 0x40FF,
0x40FF, 0x40FF, 0x40FF, 0x4100, 0xBC03, 0xBC83, 0xBCC5, 0xBD03, 0xBD24, 0xBD45, 0xBD65, 0xBD83,
0xBD93, 0xBDA4, 0xBDB4, 0xBDC5, 0xBDD5, 0xBDE5, 0xBDF6, 0xBE03, 0xBE0B, 0xBE13, 0xBE1C, 0xBE24,
0xBE2C, 0xBE34, 0xBE3C, 0xBE45, 0xBE4D, 0xBE55, 0xBE5D, 0xBE65, 0xBE6E, 0xBE76, 0xBE7E, 0xBE83,
0xBE87, 0xBE8B, 0xBE8F, 0xBE93, 0xBE98, 0xBE9C, 0xBEA0, 0xBEA4, 0xBEA8, 0xBEAC, 0xBEB0, 0xBEB4,
0xBEB8, 0xBEBC, 0xBEC1, 0xBEC5, 0xBEC9, 0xBECD, 0xBED1, 0xBED5, 0xBED9, 0xBEDD, 0xBEE1, 0xBEE5,
0xBEE9, 0xBEEE, 0xBEF2, 0xBEF6, 0xBEFA, 0xBEFE, 0xBF01, 0xBF03, 0xBF05, 0xBF07, 0xBF09, 0xBF0B,
0xBF0D, 0xBF0F, 0xBF11, 0xBF13, 0xBF16, 0xBF18, 0xBF1A, 0xBF1C, 0xBF1E, 0xBF20, 0xBF22, 0xBF24,
0xBF26, 0xBF28, 0xBF2A, 0xBF2C, 0xBF2E, 0xBF30, 0xBF32, 0xBF34, 0xBF36, 0xBF38, 0xBF3A, 0xBF3C,
0xBF3E, 0xBF41, 0xBF43, 0xBF45, 0xBF47, 0xBF49, 0xBF4B, 0xBF4D, 0xBF4F, 0xBF51, 0xBF53, 0xBF55,
0xBF57, 0xBF59, 0xBF5B, 0xBF5D, 0xBF5F, 0xBF61, 0xBF63, 0xBF65, 0xBF67, 0xBF69, 0xBF6C, 0xBF6E,
0xBF70, 0xBF72, 0xBF74, 0xBF76, 0xBF78, 0xBF7A, 0xBF7C, 0xBF7E, 0xBF80, 0xBF81, 0xBF82, 0xBF83,
0xBF84, 0xBF85, 0xBF86, 0xBF87, 0xBF88, 0xBF89, 0xBF8A, 0xBF8B, 0xBF8C, 0xBF8D, 0xBF8E, 0xBF8F,
0xBF90, 0xBF91, 0xBF92, 0xBF93, 0xBF94, 0xBF96, 0xBF97, 0xBF98, 0xBF99, 0xBF9A, 0xBF9B, 0xBF9C,
0xBF9D, 0xBF9E, 0xBF9F, 0xBFA0, 0xBFA1, 0xBFA2, 0xBFA3, 0xBFA4, 0xBFA5, 0xBFA6, 0xBFA7, 0xBFA8,
0xBFA9, 0xBFAA, 0xBFAB, 0xBFAC, 0xBFAD, 0xBFAE, 0xBFAF, 0xBFB0, 0xBFB1, 0xBFB2, 0xBFB3, 0xBFB4,
0xBFB5, 0xBFB6, 0xBFB7, 0xBFB8, 0xBFB9, 0xBFBA, 0xBFBB, 0xBFBC, 0xBFBD, 0xBFBE, 0xBFBF, 0xBFC1,
0xBFC2, 0xBFC3, 0xBFC4, 0xBFC5, 0xBFC6, 0xBFC7, 0xBFC8, 0xBFC9, 0xBFCA, 0xBFCB, 0xBFCC, 0xBFCD,
0xBFCE, 0xBFCF, 0xBFD0, 0xBFD1, 0xBFD2, 0xBFD3, 0xBFD4, 0xBFD5, 0xBFD6, 0xBFD7, 0xBFD8, 0xBFD9,
0xBFDA, 0xBFDB, 0xBFDC, 0xBFDD, 0xBFDE, 0xBFDF, 0xBFE0, 0xBFE1, 0xBFE2, 0xBFE3, 0xBFE4, 0xBFE5,
0xBFE6, 0xBFE7, 0xBFE8, 0xBFE9, 0xBFEA, 0xBFEC, 0xBFED, 0xBFEE, 0xBFEF, 0xBFF0, 0xBFF1, 0xBFF2,
0xBFF3, 0xBFF4, 0xBFF5, 0xBFF6, 0xBFF7, 0xBFF8, 0xBFF9, 0xBFFA, 0xBFFB, 0xBFFC, 0xBFFD, 0xBFFE,
0xBFFF, 0xC000, 0xC001, 0xC001, 0xC002, 0xC002, 0xC003, 0xC003, 0xC004, 0xC004, 0xC005, 0xC005,
0xC006, 0xC006, 0xC007, 0xC007, 0xC008, 0xC008, 0xC009, 0xC009, 0xC00A, 0xC00A, 0xC00B, 0xC00B,
0xC00C, 0xC00C, 0xC00D, 0xC00D, 0xC00E, 0xC00E, 0xC00F, 0xC00F, 0xC010, 0xC010, 0xC011, 0xC011,
0xC012, 0xC012, 0xC013, 0xC013, 0xC014, 0xC014, 0xC015, 0xC016, 0xC016, 0xC017, 0xC017, 0xC018,
0xC018, 0xC019, 0xC019, 0xC01A, 0xC01A, 0xC01B, 0xC01B, 0xC01C, 0xC01C, 0xC01D, 0xC01D, 0xC01E,
0xC01E, 0xC01F, 0xC01F, 0xC020, 0xC020, 0xC021, 0xC021, 0xC022, 0xC022, 0xC023, 0xC023, 0xC024,
0xC024, 0xC025, 0xC025, 0xC026, 0xC026, 0xC027, 0xC027, 0xC028, 0xC028, 0xC029, 0xC029, 0xC02A,
0xC02A, 0xC02B, 0xC02C, 0xC02C, 0xC02D, 0xC02D, 0xC02E, 0xC02E, 0xC02F, 0xC02F, 0xC030, 0xC030,
0xC031, 0xC031, 0xC032, 0xC032, 0xC033, 0xC033, 0xC034, 0xC034, 0xC035, 0xC035, 0xC036, 0xC036,
0xC037, 0xC037, 0xC038, 0xC038, 0xC039, 0xC039, 0xC03A, 0xC03A, 0xC03B, 0xC03B, 0xC03C, 0xC03C,
0xC03D, 0xC03D, 0xC03E, 0xC03E, 0xC03F, 0xC03F, 0xC040, 0xC041, 0xC041, 0xC042, 0xC042, 0xC043,
0xC043, 0xC044, 0xC044, 0xC045, 0xC045, 0xC046, 0xC046, 0xC047, 0xC047, 0xC048, 0xC048, 0xC049,
0xC049, 0xC04A, 0xC04A, 0xC04B, 0xC04B, 0xC04C, 0xC04C, 0xC04D, 0xC04D, 0xC04E, 0xC04E, 0xC04F,
0xC04F, 0xC050, 0xC050, 0xC051, 0xC051, 0xC052, 0xC052, 0xC053, 0xC053, 0xC054, 0xC054, 0xC055,
0xC056, 0xC056, 0xC057, 0xC057, 0xC058, 0xC058, 0xC059, 0xC059, 0xC05A, 0xC05A, 0xC05B, 0xC05B,
0xC05C, 0xC05C, 0xC05D, 0xC05D, 0xC05E, 0xC05E, 0xC05F, 0xC05F, 0xC060, 0xC060, 0xC061, 0xC061,
0xC062, 0xC062, 0xC063, 0xC063, 0xC064, 0xC064, 0xC065, 0xC065, 0xC066, 0xC066, 0xC067, 0xC067,
0xC068, 0xC068, 0xC069, 0xC069, 0xC06A, 0xC06A, 0xC06B, 0xC06C, 0xC06C, 0xC06D, 0xC06D, 0xC06E,
0xC06E, 0xC06F, 0xC06F, 0xC070, 0xC070, 0xC071, 0xC071, 0xC072, 0xC072, 0xC073, 0xC073, 0xC074,
0xC074, 0xC075, 0xC075, 0xC076, 0xC076, 0xC077, 0xC077, 0xC078, 0xC078, 0xC079, 0xC079, 0xC07A,
0xC07A, 0xC07B, 0xC07B, 0xC07C, 0xC07C, 0xC07D, 0xC07D, 0xC07E, 0xC07E, 0xC07F, 0xC07F, 0xC080,
0xC080, 0xC081, 0xC081, 0xC081, 0xC081, 0xC082, 0xC082, 0xC082, 0xC082, 0xC083, 0xC083, 0xC083,
0xC083, 0xC084, 0xC084, 0xC084, 0xC084, 0xC085, 0xC085, 0xC085, 0xC085, 0xC086, 0xC086, 0xC086,
0xC086, 0xC087, 0xC087, 0xC087, 0xC087, 0xC088, 0xC088, 0xC088, 0xC088, 0xC089, 0xC089, 0xC089,
0xC089, 0xC08A, 0xC08A, 0xC08A, 0xC08A, 0xC08B, 0xC08B, 0xC08B, 0xC08C, 0xC08C, 0xC08C, 0xC08C,
0xC08D, 0xC08D, 0xC08D, 0xC08D, 0xC08E, 0xC08E, 0xC08E, 0xC08E, 0xC08F, 0xC08F, 0xC08F, 0xC08F,
0xC090, 0xC090, 0xC090, 0xC090, 0xC091, 0xC091, 0xC091, 0xC091, 0xC092, 0xC092, 0xC092, 0xC092,
0xC093, 0xC093, 0xC093, 0xC093, 0xC094, 0xC094, 0xC094, 0xC094, 0xC095, 0xC095, 0xC095, 0xC096,
0xC096, 0xC096, 0xC096, 0xC097, 0xC097, 0xC097, 0xC097, 0xC098, 0xC098, 0xC098, 0xC098, 0xC099,
0xC099, 0xC099, 0xC099, 0xC09A, 0xC09A, 0xC09A, 0xC09A, 0xC09B, 0xC09B, 0xC09B, 0xC09B, 0xC09C,
0xC09C, 0xC09C, 0xC09C, 0xC09D, 0xC09D, 0xC09D, 0xC09D, 0xC09E, 0xC09E, 0xC09E, 0xC09E, 0xC09F,
0xC09F, 0xC09F, 0xC09F, 0xC0A0, 0xC0A0, 0xC0A0, 0xC0A1, 0xC0A1, 0xC0A1, 0xC0A1, 0xC0A2, 0xC0A2,
0xC0A2, 0xC0A2, 0xC0A3, 0xC0A3, 0xC0A3, 0xC0A3, 0xC0A4, 0xC0A4, 0xC0A4, 0xC0A4, 0xC0A5, 0xC0A5,
0xC0A5, 0xC0A5, 0xC0A6, 0xC0A6, 0xC0A6, 0xC0A6, 0xC0A7, 0xC0A7, 0xC0A7, 0xC0A7, 0xC0A8, 0xC0A8,
0xC0A8, 0xC0A8, 0xC0A9, 0xC0A9, 0xC0A9, 0xC0A9, 0xC0AA, 0xC0AA, 0xC0AA, 0xC0AA, 0xC0AB, 0xC0AB,
0xC0AB, 0xC0AC, 0xC0AC, 0xC0AC, 0xC0AC, 0xC0AD, 0xC0AD, 0xC0AD, 0xC0AD, 0xC0AE, 0xC0AE, 0xC0AE,
0xC0AE, 0xC0AF, 0xC0AF, 0xC0AF, 0xC0AF, 0xC0B0, 0xC0B0, 0xC0B0, 0xC0B0, 0xC0B1, 0xC0B1, 0xC0B1,
0xC0B1, 0xC0B2, 0xC0B2, 0xC0B2, 0xC0B2, 0xC0B3, 0xC0B3, 0xC0B3, 0xC0B3, 0xC0B4, 0xC0B4, 0xC0B4,
0xC0B4, 0xC0B5, 0xC0B5, 0xC0B5, 0xC0B6, 0xC0B6, 0xC0B6, 0xC0B6, 0xC0B7, 0xC0B7, 0xC0B7, 0xC0B7,
0xC0B8, 0xC0B8, 0xC0B8, 0xC0B8, 0xC0B9, 0xC0B9, 0xC0B9, 0xC0B9, 0xC0BA, 0xC0BA, 0xC0BA, 0xC0BA,
0xC0BB, 0xC0BB, 0xC0BB, 0xC0BB, 0xC0BC, 0xC0BC, 0xC0BC, 0xC0BC, 0xC0BD, 0xC0BD, 0xC0BD, 0xC0BD,
0xC0BE, 0xC0BE, 0xC0BE, 0xC0BE, 0xC0BF, 0xC0BF, 0xC0BF, 0xC0BF, 0xC0C0, 0xC0C0, 0xC0C0, 0xC0C1,
0xC0C1, 0xC0C1, 0xC0C1, 0xC0C2, 0xC0C2, 0xC0C2, 0xC0C2, 0xC0C3, 0xC0C3, 0xC0C3, 0xC0C3, 0xC0C4,
0xC0C4, 0xC0C4, 0xC0C4, 0xC0C5, 0xC0C5, 0xC0C5, 0xC0C5, 0xC0C6, 0xC0C6, 0xC0C6, 0xC0C6, 0xC0C7,
0xC0C7, 0xC0C7, 0xC0C7, 0xC0C8, 0xC0C8, 0xC0C8, 0xC0C8, 0xC0C9, 0xC0C9, 0xC0C9, 0xC0C9, 0xC0CA,
0xC0CA, 0xC0CA, 0xC0CA, 0xC0CB, 0xC0CB, 0xC0CB, 0xC0CC, 0xC0CC, 0xC0CC, 0xC0CC, 0xC0CD, 0xC0CD,
0xC0CD, 0xC0CD, 0xC0CE, 0xC0CE, 0xC0CE, 0xC0CE, 0xC0CF, 0xC0CF, 0xC0CF, 0xC0CF, 0xC0D0, 0xC0D0,
0xC0D0, 0xC0D0, 0xC0D1, 0xC0D1, 0xC0D1, 0xC0D1, 0xC0D2, 0xC0D2, 0xC0D2, 0xC0D2, 0xC0D3, 0xC0D3,
0xC0D3, 0xC0D3, 0xC0D4, 0xC0D4, 0xC0D4, 0xC0D4, 0xC0D5, 0xC0D5, 0xC0D5, 0xC0D6, 0xC0D6, 0xC0D6,
0xC0D6, 0xC0D7, 0xC0D7, 0xC0D7, 0xC0D7, 0xC0D8, 0xC0D8, 0xC0D8, 0xC0D8, 0xC0D9, 0xC0D9, 0xC0D9,
0xC0D9, 0xC0DA, 0xC0DA, 0xC0DA, 0xC0DA, 0xC0DB, 0xC0DB, 0xC0DB, 0xC0DB, 0xC0DC, 0xC0DC, 0xC0DC,
0xC0DC, 0xC0DD, 0xC0DD, 0xC0DD, 0xC0DD, 0xC0DE, 0xC0DE, 0xC0DE, 0xC0DE, 0xC0DF, 0xC0DF, 0xC0DF,
0xC0DF, 0xC0E0, 0xC0E0, 0xC0E0, 0xC0E1, 0xC0E1, 0xC0E1, 0xC0E1, 0xC0E2, 0xC0E2, 0xC0E2, 0xC0E2,
0xC0E3, 0xC0E3, 0xC0E3, 0xC0E3, 0xC0E4, 0xC0E4, 0xC0E4, 0xC0E4, 0xC0E5, 0xC0E5, 0xC0E5, 0xC0E5,
0xC0E6, 0xC0E6, 0xC0E6, 0xC0E6, 0xC0E7, 0xC0E7, 0xC0E7, 0xC0E7, 0xC0E8, 0xC0E8, 0xC0E8, 0xC0E8,
0xC0E9, 0xC0E9, 0xC0E9, 0xC0E9, 0xC0EA, 0xC0EA, 0xC0EA, 0xC0EA, 0xC0EB, 0xC0EB, 0xC0EB, 0xC0EC,
0xC0EC, 0xC0EC, 0xC0EC, 0xC0ED, 0xC0ED, 0xC0ED, 0xC0ED, 0xC0EE, 0xC0EE, 0xC0EE, 0xC0EE, 0xC0EF,
0xC0EF, 0xC0EF, 0xC0EF, 0xC0F0, 0xC0F0, 0xC0F0, 0xC0F0, 0xC0F1, 0xC0F1, 0xC0F1, 0xC0F1, 0xC0F2,
0xC0F2, 0xC0F2, 0xC0F2, 0xC0F3, 0xC0F3, 0xC0F3, 0xC0F3, 0xC0F4, 0xC0F4, 0xC0F4, 0xC0F4, 0xC0F5,
0xC0F5, 0xC0F5, 0xC0F6, 0xC0F6, 0xC0F6, 0xC0F6, 0xC0F7, 0xC0F7, 0xC0F7, 0xC0F7, 0xC0F8, 0xC0F8,
0xC0F8, 0xC0F8, 0xC0F9, 0xC0F9, 0xC0F9, 0xC0F9, 0xC0FA, 0xC0FA, 0xC0FA, 0xC0FA, 0xC0FB, 0xC0FB,
0xC0FB, 0xC0FB, 0xC0FC, 0xC0FC, 0xC0FC, 0xC0FC, 0xC0FD, 0xC0FD, 0xC0FD, 0xC0FD, 0xC0FE, 0xC0FE,
0xC0FE, 0xC0FE, 0xC0FF, 0xC0FF, 0xC0FF, 0xC0FF, 0xC100, 0xC100,
};
static uint16_t sigmode_golden_bf16[] = {
0x3f00, 0x3f01, 0x3f01, 0x3f02, 0x3f02, 0x3f03, 0x3f03, 0x3f04, 0x3f04, 0x3f05, 0x3f05, 0x3f06,
0x3f06, 0x3f07, 0x3f07, 0x3f08, 0x3f08, 0x3f09, 0x3f09, 0x3f0a, 0x3f0a, 0x3f0b, 0x3f0b, 0x3f0c,
0x3f0c, 0x3f0d, 0x3f0d, 0x3f0e, 0x3f0e, 0x3f0f, 0x3f0f, 0x3f10, 0x3f10, 0x3f11, 0x3f11, 0x3f12,
0x3f12, 0x3f13, 0x3f13, 0x3f14, 0x3f14, 0x3f15, 0x3f15, 0x3f16, 0x3f16, 0x3f17, 0x3f17, 0x3f18,
0x3f19, 0x3f19, 0x3f1a, 0x3f1a, 0x3f1b, 0x3f1b, 0x3f1b, 0x3f1c, 0x3f1d, 0x3f1d, 0x3f1e, 0x3f1e,
0x3f1f, 0x3f1f, 0x3f20, 0x3f1f, 0x3f20, 0x3f20, 0x3f21, 0x3f21, 0x3f22, 0x3f22, 0x3f23, 0x3f23,
0x3f24, 0x3f24, 0x3f25, 0x3f25, 0x3f26, 0x3f26, 0x3f27, 0x3f27, 0x3f28, 0x3f28, 0x3f29, 0x3f29,
0x3f2a, 0x3f2a, 0x3f2a, 0x3f2a, 0x3f2b, 0x3f2b, 0x3f2c, 0x3f2c, 0x3f2d, 0x3f2d, 0x3f2e, 0x3f2f,
0x3f2f, 0x3f30, 0x3f30, 0x3f30, 0x3f31, 0x3f31, 0x3f31, 0x3f32, 0x3f32, 0x3f32, 0x3f33, 0x3f33,
0x3f34, 0x3f34, 0x3f35, 0x3f36, 0x3f36, 0x3f36, 0x3f37, 0x3f37, 0x3f38, 0x3f38, 0x3f38, 0x3f39,
0x3f39, 0x3f3a, 0x3f3a, 0x3f3a, 0x3f3b, 0x3f3b, 0x3f3b, 0x3f3c, 0x3f3c, 0x3f3d, 0x3f3d, 0x3f3d,
0x3f3e, 0x3f3e, 0x3f3e, 0x3f3f, 0x3f3f, 0x3f40, 0x3f40, 0x3f40, 0x3f41, 0x3f41, 0x3f41, 0x3f42,
0x3f42, 0x3f42, 0x3f43, 0x3f44, 0x3f44, 0x3f44, 0x3f45, 0x3f45, 0x3f45, 0x3f46, 0x3f46, 0x3f46,
0x3f47, 0x3f47, 0x3f48, 0x3f48, 0x3f48, 0x3f49, 0x3f49, 0x3f49, 0x3f4a, 0x3f4a, 0x3f4b, 0x3f4b,
0x3f4b, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4d, 0x3f4d, 0x3f4d, 0x3f4e, 0x3f4e, 0x3f4e,
0x3f4f, 0x3f4f, 0x3f50, 0x3f50, 0x3f50, 0x3f51, 0x3f51, 0x3f51, 0x3f51, 0x3f52, 0x3f52, 0x3f52,
0x3f52, 0x3f53, 0x3f53, 0x3f54, 0x3f54, 0x3f55, 0x3f55, 0x3f55, 0x3f55, 0x3f56, 0x3f56, 0x3f56,
0x3f56, 0x3f57, 0x3f57, 0x3f57, 0x3f57, 0x3f58, 0x3f58, 0x3f58, 0x3f58, 0x3f59, 0x3f59, 0x3f59,
0x3f59, 0x3f5a, 0x3f5a, 0x3f5a, 0x3f5a, 0x3f5a, 0x3f5b, 0x3f5b, 0x3f5b, 0x3f5b, 0x3f5c, 0x3f5c,
0x3f5c, 0x3f5c, 0x3f5d, 0x3f5d, 0x3f5d, 0x3f5e, 0x3f5e, 0x3f5e, 0x3f5e, 0x3f5f, 0x3f5f, 0x3f5f,
0x3f5f, 0x3f60, 0x3f60, 0x3f60, 0x3f60, 0x3f61, 0x3f61, 0x3f61, 0x3f61, 0x3f62, 0x3f61, 0x3f61,
0x3f61, 0x3f62, 0x3f62, 0x3f62, 0x3f62, 0x3f63, 0x3f63, 0x3f63, 0x3f63, 0x3f64, 0x3f64, 0x3f64,
0x3f64, 0x3f65, 0x3f65, 0x3f65, 0x3f65, 0x3f66, 0x3f66, 0x3f66, 0x3f66, 0x3f66, 0x3f66, 0x3f66,
0x3f66, 0x3f67, 0x3f67, 0x3f67, 0x3f67, 0x3f68, 0x3f68, 0x3f68, 0x3f68, 0x3f69, 0x3f69, 0x3f69,
0x3f69, 0x3f69, 0x3f69, 0x3f69, 0x3f6a, 0x3f6a, 0x3f6a, 0x3f6a, 0x3f6a, 0x3f6a, 0x3f6a, 0x3f6a,
0x3f6b, 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6c, 0x3f6c, 0x3f6c, 0x3f6c,
0x3f6d, 0x3f6d, 0x3f6d, 0x3f6d, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e,
0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f70, 0x3f70,
0x3f70, 0x3f70, 0x3f70, 0x3f70, 0x3f70, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71,
0x3f71, 0x3f72, 0x3f72, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f72, 0x3f72, 0x3f72,
0x3f72, 0x3f72, 0x3f72, 0x3f72, 0x3f72, 0x3f73, 0x3f73, 0x3f73, 0x3f73, 0x3f73, 0x3f73, 0x3f73,
0x3f73, 0x3f74, 0x3f74, 0x3f74, 0x3f74, 0x3f74, 0x3f74, 0x3f74, 0x3f75, 0x3f75, 0x3f75, 0x3f75,
0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75,
0x3f75, 0x3f75, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76,
0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f77, 0x3f77, 0x3f77, 0x3f77, 0x3f77, 0x3f77,
0x3f77, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78,
0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78,
0x3f78, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79,
0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a,
0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a,
0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b,
0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b,
0x3f7b, 0x3f7b, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c,
0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c,
0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7d, 0x3f7d,
0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d,
0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d,
0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d,
0x3f7d, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e,
0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e,
0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e,
0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e,
0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e,
0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7f,
0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f,
0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f,
0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f,
0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f,
0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f,
0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f,
0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f,
0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f,
0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f,
0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f,
0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f,
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80,
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80,
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80,
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80,
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80,
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80,
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80,
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80,
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80,
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80,
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80,
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80,
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80,
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80,
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80,
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80,
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80,
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80,
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3eff, 0x3efe, 0x3efd, 0x3efc, 0x3efb, 0x3efa, 0x3ef9, 0x3ef8,
0x3ef7, 0x3ef6, 0x3ef5, 0x3ef4, 0x3ef3, 0x3ef2, 0x3ef1, 0x3ef0, 0x3eef, 0x3eee, 0x3eed, 0x3eec,
0x3eeb, 0x3eea, 0x3ee9, 0x3ee7, 0x3ee6, 0x3ee5, 0x3ee4, 0x3ee3, 0x3ee2, 0x3ee1, 0x3ee0, 0x3edf,
0x3ede, 0x3edd, 0x3edc, 0x3edb, 0x3eda, 0x3ed9, 0x3ed8, 0x3ed7, 0x3ed6, 0x3ed5, 0x3ed4, 0x3ed3,
0x3ed2, 0x3ed1, 0x3ed1, 0x3ed0, 0x3ecf, 0x3ece, 0x3ecd, 0x3ecc, 0x3ecb, 0x3eca, 0x3ec9, 0x3ec8,
0x3ec7, 0x3ec6, 0x3ec5, 0x3ec4, 0x3ec3, 0x3ec2, 0x3ec1, 0x3ec0, 0x3ebf, 0x3ebe, 0x3ebd, 0x3ebc,
0x3ebb, 0x3eba, 0x3eba, 0x3eb9, 0x3eb7, 0x3eb6, 0x3eb5, 0x3eb4, 0x3eb4, 0x3eb3, 0x3eb2, 0x3eb1,
0x3eb0, 0x3eaf, 0x3eaf, 0x3eae, 0x3ead, 0x3eab, 0x3eaa, 0x3ea9, 0x3ea8, 0x3ea7, 0x3ea7, 0x3ea6,
0x3ea5, 0x3ea4, 0x3ea3, 0x3ea2, 0x3ea1, 0x3ea0, 0x3e9f, 0x3e9e, 0x3e9e, 0x3e9d, 0x3e9c, 0x3e9b,
0x3e9a, 0x3e99, 0x3e98, 0x3e98, 0x3e97, 0x3e97, 0x3e96, 0x3e95, 0x3e94, 0x3e93, 0x3e92, 0x3e91,
0x3e90, 0x3e8f, 0x3e8e, 0x3e8e, 0x3e8d, 0x3e8c, 0x3e8b, 0x3e8a, 0x3e8a, 0x3e89, 0x3e88, 0x3e88,
0x3e87, 0x3e86, 0x3e85, 0x3e85, 0x3e83, 0x3e82, 0x3e82, 0x3e81, 0x3e80, 0x3e7e, 0x3e7d, 0x3e7c,
0x3e7b, 0x3e7a, 0x3e78, 0x3e77, 0x3e75, 0x3e72, 0x3e71, 0x3e6f, 0x3e6e, 0x3e6c, 0x3e6b, 0x3e69,
0x3e68, 0x3e67, 0x3e65, 0x3e64, 0x3e63, 0x3e61, 0x3e60, 0x3e5f, 0x3e5d, 0x3e5c, 0x3e5a, 0x3e59,
0x3e58, 0x3e56, 0x3e55, 0x3e54, 0x3e52, 0x3e51, 0x3e50, 0x3e4f, 0x3e4e, 0x3e4c, 0x3e4b, 0x3e4a,
0x3e49, 0x3e47, 0x3e46, 0x3e45, 0x3e44, 0x3e43, 0x3e41, 0x3e40, 0x3e3f, 0x3e3e, 0x3e3c, 0x3e3a,
0x3e39, 0x3e37, 0x3e36, 0x3e35, 0x3e34, 0x3e33, 0x3e31, 0x3e30, 0x3e2f, 0x3e2e, 0x3e2c, 0x3e2b,
0x3e2a, 0x3e29, 0x3e28, 0x3e27, 0x3e26, 0x3e25, 0x3e24, 0x3e23, 0x3e22, 0x3e20, 0x3e20, 0x3e1f,
0x3e1e, 0x3e1d, 0x3e1c, 0x3e1b, 0x3e1a, 0x3e19, 0x3e18, 0x3e17, 0x3e16, 0x3e15, 0x3e14, 0x3e13,
0x3e12, 0x3e11, 0x3e10, 0x3e0f, 0x3e0e, 0x3e0c, 0x3e0b, 0x3e0a, 0x3e09, 0x3e08, 0x3e07, 0x3e06,
0x3e05, 0x3e04, 0x3e03, 0x3e03, 0x3e02, 0x3e01, 0x3e00, 0x3dff, 0x3dfd, 0x3dfb, 0x3df9, 0x3df8,
0x3df6, 0x3df4, 0x3df1, 0x3df1, 0x3ded, 0x3ded, 0x3dea, 0x3dea, 0x3de7, 0x3de7, 0x3de4, 0x3de4,
0x3de1, 0x3de1, 0x3dde, 0x3dde, 0x3ddb, 0x3ddb, 0x3dd8, 0x3dd8, 0x3dd5, 0x3dd5, 0x3dd2, 0x3dd2,
0x3dcf, 0x3dcf, 0x3dcc, 0x3dcc, 0x3dc9, 0x3dc9, 0x3dc7, 0x3dc7, 0x3dc3, 0x3dc3, 0x3dc0, 0x3dc0,
0x3dbe, 0x3dbe, 0x3dbb, 0x3dbb, 0x3db9, 0x3db9, 0x3db6, 0x3db4, 0x3db4, 0x3db1, 0x3db1, 0x3dae,
0x3dae, 0x3dac, 0x3dac, 0x3da9, 0x3da9, 0x3da7, 0x3da7, 0x3da5, 0x3da5, 0x3da3, 0x3da3, 0x3da0,
0x3da0, 0x3d9e, 0x3d9e, 0x3d9b, 0x3d9b, 0x3d99, 0x3d99, 0x3d97, 0x3d97, 0x3d94, 0x3d94, 0x3d93,
0x3d93, 0x3d91, 0x3d91, 0x3d8f, 0x3d8f, 0x3d8d, 0x3d8d, 0x3d8a, 0x3d8a, 0x3d88, 0x3d88, 0x3d86,
0x3d86, 0x3d84, 0x3d82, 0x3d82, 0x3d80, 0x3d80, 0x3d7d, 0x3d7d, 0x3d79, 0x3d79, 0x3d76, 0x3d76,
0x3d72, 0x3d72, 0x3d6f, 0x3d6f, 0x3d6b, 0x3d6b, 0x3d68, 0x3d68, 0x3d65, 0x3d65, 0x3d61, 0x3d61,
0x3d5e, 0x3d5e, 0x3d5b, 0x3d5b, 0x3d58, 0x3d58, 0x3d55, 0x3d55, 0x3d52, 0x3d52, 0x3d4e, 0x3d4e,
0x3d4b, 0x3d4b, 0x3d48, 0x3d48, 0x3d45, 0x3d45, 0x3d42, 0x3d3f, 0x3d3f, 0x3d3c, 0x3d3c, 0x3d3a,
0x3d3a, 0x3d37, 0x3d37, 0x3d34, 0x3d34, 0x3d32, 0x3d32, 0x3d2f, 0x3d2f, 0x3d2c, 0x3d2c, 0x3d2a,
0x3d2a, 0x3d27, 0x3d27, 0x3d24, 0x3d24, 0x3d22, 0x3d22, 0x3d20, 0x3d20, 0x3d1d, 0x3d1d, 0x3d1b,
0x3d1b, 0x3d19, 0x3d19, 0x3d17, 0x3d17, 0x3d15, 0x3d15, 0x3d12, 0x3d12, 0x3d10, 0x3d10, 0x3d0e,
0x3d0c, 0x3d0c, 0x3d0a, 0x3d0a, 0x3d08, 0x3d08, 0x3d06, 0x3d06, 0x3d04, 0x3d04, 0x3d02, 0x3d02,
0x3cff, 0x3cff, 0x3cfb, 0x3cfb, 0x3cf8, 0x3cf8, 0x3cf4, 0x3cf4, 0x3cf0, 0x3cf0, 0x3cec, 0x3cec,
0x3ce9, 0x3ce9, 0x3ce5, 0x3ce5, 0x3ce2, 0x3ce2, 0x3cdf, 0x3cdf, 0x3cdb, 0x3cdb, 0x3cd8, 0x3cd8,
0x3cd5, 0x3cd5, 0x3cd2, 0x3cd2, 0x3ccf, 0x3ccf, 0x3ccc, 0x3cc8, 0x3cc8, 0x3cc5, 0x3cc5, 0x3cc2,
0x3cc2, 0x3cbf, 0x3cbf, 0x3cbc, 0x3cbc, 0x3cb9, 0x3cb9, 0x3cb6, 0x3cb6, 0x3cb4, 0x3cb4, 0x3cb1,
0x3cb1, 0x3cae, 0x3cae, 0x3cac, 0x3cac, 0x3ca9, 0x3ca9, 0x3ca7, 0x3ca7, 0x3ca5, 0x3ca5, 0x3ca2,
0x3ca2, 0x3ca0, 0x3ca0, 0x3c9d, 0x3c9d, 0x3c9b, 0x3c9b, 0x3c98, 0x3c98, 0x3c96, 0x3c96, 0x3c93,
0x3c93, 0x3c8f, 0x3c8f, 0x3c8f, 0x3c8f, 0x3c8b, 0x3c8b, 0x3c8b, 0x3c8b, 0x3c87, 0x3c87, 0x3c87,
0x3c87, 0x3c82, 0x3c82, 0x3c82, 0x3c82, 0x3c7c, 0x3c7c, 0x3c7c, 0x3c7c, 0x3c75, 0x3c75, 0x3c75,
0x3c75, 0x3c6e, 0x3c6e, 0x3c6e, 0x3c6e, 0x3c66, 0x3c66, 0x3c66, 0x3c66, 0x3c5f, 0x3c5f, 0x3c5f,
0x3c5f, 0x3c59, 0x3c59, 0x3c59, 0x3c59, 0x3c53, 0x3c53, 0x3c53, 0x3c4c, 0x3c4c, 0x3c4c, 0x3c4c,
0x3c46, 0x3c46, 0x3c46, 0x3c46, 0x3c3f, 0x3c3f, 0x3c3f, 0x3c3f, 0x3c39, 0x3c39, 0x3c39, 0x3c39,
0x3c34, 0x3c34, 0x3c34, 0x3c34, 0x3c2f, 0x3c2f, 0x3c2f, 0x3c2f, 0x3c29, 0x3c29, 0x3c29, 0x3c29,
0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c1f, 0x3c1f, 0x3c1f, 0x3c1f, 0x3c1a, 0x3c1a, 0x3c1a, 0x3c16,
0x3c16, 0x3c16, 0x3c16, 0x3c12, 0x3c12, 0x3c12, 0x3c12, 0x3c0d, 0x3c0d, 0x3c0d, 0x3c0d, 0x3c09,
0x3c09, 0x3c09, 0x3c09, 0x3c04, 0x3c04, 0x3c04, 0x3c04, 0x3c00, 0x3c00, 0x3c00, 0x3c00, 0x3bf8,
0x3bf8, 0x3bf8, 0x3bf8, 0x3bf1, 0x3bf1, 0x3bf1, 0x3bf1, 0x3be9, 0x3be9, 0x3be9, 0x3be9, 0x3be2,
0x3be2, 0x3be2, 0x3be2, 0x3bdb, 0x3bdb, 0x3bdb, 0x3bd4, 0x3bd4, 0x3bd4, 0x3bd4, 0x3bce, 0x3bce,
0x3bce, 0x3bce, 0x3bc8, 0x3bc8, 0x3bc8, 0x3bc8, 0x3bc2, 0x3bc2, 0x3bc2, 0x3bc2, 0x3bbc, 0x3bbc,
0x3bbc, 0x3bbc, 0x3bb6, 0x3bb6, 0x3bb6, 0x3bb6, 0x3bb0, 0x3bb0, 0x3bb0, 0x3bb0, 0x3bab, 0x3bab,
0x3bab, 0x3bab, 0x3ba6, 0x3ba6, 0x3ba6, 0x3ba6, 0x3ba1, 0x3ba1, 0x3ba1, 0x3ba1, 0x3b9c, 0x3b9c,
0x3b9c, 0x3b97, 0x3b97, 0x3b97, 0x3b97, 0x3b92, 0x3b92, 0x3b92, 0x3b92, 0x3b8e, 0x3b8e, 0x3b8e,
0x3b8e, 0x3b8a, 0x3b8a, 0x3b8a, 0x3b8a, 0x3b85, 0x3b85, 0x3b85, 0x3b85, 0x3b81, 0x3b81, 0x3b81,
0x3b81, 0x3b7b, 0x3b7b, 0x3b7b, 0x3b7b, 0x3b73, 0x3b73, 0x3b73, 0x3b73, 0x3b6c, 0x3b6c, 0x3b6c,
0x3b6c, 0x3b65, 0x3b65, 0x3b65, 0x3b5d, 0x3b5d, 0x3b5d, 0x3b5d, 0x3b56, 0x3b56, 0x3b56, 0x3b56,
0x3b50, 0x3b50, 0x3b50, 0x3b50, 0x3b4a, 0x3b4a, 0x3b4a, 0x3b4a, 0x3b43, 0x3b43, 0x3b43, 0x3b43,
0x3b3d, 0x3b3d, 0x3b3d, 0x3b3d, 0x3b38, 0x3b38, 0x3b38, 0x3b38, 0x3b32, 0x3b32, 0x3b32, 0x3b32,
0x3b2c, 0x3b2c, 0x3b2c, 0x3b2c, 0x3b27, 0x3b27, 0x3b27, 0x3b27, 0x3b22, 0x3b22, 0x3b22, 0x3b1d,
0x3b1d, 0x3b1d, 0x3b1d, 0x3b18, 0x3b18, 0x3b18, 0x3b18, 0x3b13, 0x3b13, 0x3b13, 0x3b13, 0x3b0f,
0x3b0f, 0x3b0f, 0x3b0f, 0x3b0b, 0x3b0b, 0x3b0b, 0x3b0b, 0x3b06, 0x3b06, 0x3b06, 0x3b06, 0x3b02,
0x3b02, 0x3b02, 0x3b02, 0x3afd, 0x3afd, 0x3afd, 0x3afd, 0x3af5, 0x3af5, 0x3af5, 0x3af5, 0x3aed,
0x3aed, 0x3aed, 0x3aed, 0x3ae6, 0x3ae6, 0x3ae6, 0x3adf, 0x3adf, 0x3adf, 0x3adf, 0x3ad8, 0x3ad8,
0x3ad8, 0x3ad8, 0x3ad1, 0x3ad1, 0x3ad1, 0x3ad1, 0x3acb, 0x3acb, 0x3acb, 0x3acb, 0x3ac5, 0x3ac5,
0x3ac5, 0x3ac5, 0x3abf, 0x3abf, 0x3abf, 0x3abf, 0x3ab9, 0x3ab9, 0x3ab9, 0x3ab9, 0x3ab3, 0x3ab3,
0x3ab3, 0x3ab3, 0x3aae, 0x3aae, 0x3aae, 0x3aae, 0x3aa9, 0x3aa9, 0x3aa9, 0x3aa3, 0x3aa3, 0x3aa3,
0x3aa3, 0x3a9e, 0x3a9e, 0x3a9e, 0x3a9e, 0x3a99, 0x3a99, 0x3a99, 0x3a99, 0x3a94, 0x3a94, 0x3a94,
0x3a94, 0x3a90, 0x3a90, 0x3a90, 0x3a90, 0x3a8c, 0x3a8c, 0x3a8c, 0x3a8c, 0x3a87, 0x3a87, 0x3a87,
0x3a87, 0x3a83, 0x3a83, 0x3a83, 0x3a83, 0x3a7e, 0x3a7e, 0x3a7e, 0x3a7e, 0x3a76, 0x3a76, 0x3a76,
0x3a76, 0x3a6f, 0x3a6f, 0x3a6f, 0x3a68, 0x3a68, 0x3a68, 0x3a68, 0x3a60, 0x3a60, 0x3a60, 0x3a60,
0x3a59, 0x3a59, 0x3a59, 0x3a59, 0x3a53, 0x3a53, 0x3a53, 0x3a53, 0x3a4d, 0x3a4d, 0x3a4d, 0x3a4d,
0x3a46, 0x3a46, 0x3a46, 0x3a46, 0x3a40, 0x3a40, 0x3a40, 0x3a40, 0x3a3a, 0x3a3a, 0x3a3a, 0x3a3a,
0x3a34, 0x3a34, 0x3a34, 0x3a34, 0x3a2f, 0x3a2f, 0x3a2f, 0x3a2f, 0x3a2a, 0x3a2a, 0x3a2a, 0x3a24,
0x3a24, 0x3a24, 0x3a24, 0x3a1f, 0x3a1f, 0x3a1f, 0x3a1f, 0x3a1a, 0x3a1a, 0x3a1a, 0x3a1a, 0x3a15,
0x3a15, 0x3a15, 0x3a15, 0x3a11, 0x3a11, 0x3a11, 0x3a11, 0x3a0d, 0x3a0d, 0x3a0d, 0x3a0d, 0x3a08,
0x3a08, 0x3a08, 0x3a08, 0x3a04, 0x3a04, 0x3a04, 0x3a04, 0x3a00, 0x3a00, 0x3a00, 0x3a00, 0x39f8,
0x39f8, 0x39f8, 0x39f0, 0x39f0, 0x39f0, 0x39f0, 0x39e9, 0x39e9, 0x39e9, 0x39e9, 0x39e2, 0x39e2,
0x39e2, 0x39e2, 0x39db, 0x39db, 0x39db, 0x39db, 0x39d4, 0x39d4, 0x39d4, 0x39d4, 0x39ce, 0x39ce,
0x39ce, 0x39ce, 0x39c7, 0x39c7, 0x39c7, 0x39c7, 0x39c1, 0x39c1, 0x39c1, 0x39c1, 0x39bb, 0x39bb,
0x39bb, 0x39bb, 0x39b5, 0x39b5, 0x39b5, 0x39b5, 0x39b0, 0x39b0,
};
// FIXME: not hard code
// contribute from hw, fix with `PRE_DATA` input
static double sigmode_golden[] = {
0.5, 0.501999989, 0.503999915, 0.505999712, 0.507999317, 0.509998667, 0.511997697,
0.513996342, 0.515994541, 0.517992228, 0.51998934, 0.521985814, 0.523981585, 0.525976591,
0.527970767, 0.529964052, 0.531956381, 0.533947691, 0.535937921, 0.537927006, 0.539914885,
0.541901494, 0.543886772, 0.545870657, 0.547853086, 0.549833997, 0.55181333, 0.553791023,
0.555767014, 0.557741243, 0.559713649, 0.561684172, 0.56365275, 0.565619325, 0.567583836,
0.569546224, 0.571506429, 0.573464394, 0.575420058, 0.577373363, 0.579324252, 0.581272667,
0.583218549, 0.585161842, 0.58710249, 0.589040434, 0.59097562, 0.59290799, 0.594837491,
0.596764066, 0.59868766, 0.60060822, 0.60252569, 0.604440017, 0.606351149, 0.608259031,
0.610163611, 0.612064837, 0.613962657, 0.61585702, 0.617747875, 0.61963517, 0.621518857,
0.623398885, 0.625275204, 0.627147766, 0.629016523, 0.630881426, 0.632742428, 0.634599482,
0.63645254, 0.638301558, 0.640146488, 0.641987286, 0.643823907, 0.645656306, 0.64748444,
0.649308265, 0.651127739, 0.652942818, 0.654753461, 0.656559626, 0.658361272, 0.66015836,
0.661950848, 0.663738697, 0.665521869, 0.667300325, 0.669074026, 0.670842936, 0.672607017,
0.674366233, 0.676120548, 0.677869926, 0.679614333, 0.681353734, 0.683088095, 0.684817383,
0.686541565, 0.688260608, 0.689974481, 0.691683153, 0.693386592, 0.695084769, 0.696777653,
0.698465216, 0.700147429, 0.701824263, 0.703495691, 0.705161686, 0.706822221, 0.70847727,
0.710126808, 0.71177081, 0.71340925, 0.715042106, 0.716669353, 0.718290968, 0.71990693,
0.721517216, 0.723121805, 0.724720676, 0.726313808, 0.727901182, 0.729482779, 0.731058579,
0.732628564, 0.734192716, 0.735751018, 0.737303454, 0.738850006, 0.740390659, 0.741925398,
0.743454208, 0.744977074, 0.746493983, 0.748004922, 0.749509876, 0.751008835, 0.752501785,
0.753988716, 0.755469617, 0.756944477, 0.758413287, 0.759876035, 0.761332715, 0.762783316,
0.764227831, 0.765666252, 0.767098572, 0.768524783, 0.769944881, 0.771358858, 0.772766709,
0.774168429, 0.775564014, 0.77695346, 0.778336762, 0.779713917, 0.781084923, 0.782449776,
0.783808476, 0.78516102, 0.786507407, 0.787847636, 0.789181707, 0.790509619, 0.791831373,
0.79314697, 0.794456411, 0.795759698, 0.797056831, 0.798347814, 0.79963265, 0.80091134,
0.802183889, 0.803450299, 0.804710577, 0.805964724, 0.807212748, 0.808454651, 0.809690441,
0.810920123, 0.812143702, 0.813361186, 0.814572581, 0.815777894, 0.816977132, 0.818170304,
0.819357418, 0.820538481, 0.821713502, 0.82288249, 0.824045455, 0.825202406, 0.826353353,
0.827498306, 0.828637274, 0.82977027, 0.830897303, 0.832018385, 0.833133528, 0.834242742,
0.83534604, 0.836443435, 0.837534937, 0.838620561, 0.83970032, 0.840774225, 0.841842291,
0.842904531, 0.843960959, 0.84501159, 0.846056436, 0.847095514, 0.848128836, 0.84915642,
0.850178278, 0.851194427, 0.852204883, 0.85320966, 0.854208775, 0.855202244, 0.856190082,
0.857172307, 0.858148935, 0.859119982, 0.860085466, 0.861045403, 0.861999811, 0.862948707,
0.863892109, 0.864830034, 0.8657625, 0.866689525, 0.867611126, 0.868527324, 0.869438134,
0.870343577, 0.871243671, 0.872138434, 0.873027885, 0.873912043, 0.874790928, 0.875664558,
0.876532952, 0.877396131, 0.878254114, 0.879106919, 0.879954567, 0.880797078, 0.881634471,
0.882466767, 0.883293985, 0.884116145, 0.884933268, 0.885745374, 0.886552483, 0.887354615,
0.888151792, 0.888944033, 0.88973136, 0.890513792, 0.89129135, 0.892064056, 0.89283193,
0.893594992, 0.894353264, 0.895106767, 0.895855521, 0.896599549, 0.897338869, 0.898073505,
0.898803476, 0.899528804, 0.900249511, 0.900965617, 0.901677143, 0.902384111, 0.903086543,
0.903784458, 0.90447788, 0.905166828, 0.905851324, 0.90653139, 0.907207047, 0.907878316,
0.908545218, 0.909207776, 0.90986601, 0.910519941, 0.911169591, 0.911814981, 0.912456133,
0.913093067, 0.913725806, 0.914354369, 0.91497878, 0.915599058, 0.916215226, 0.916827304,
0.917435313, 0.918039275, 0.91863921, 0.919235141, 0.919827088, 0.920415072, 0.920999114,
0.921579235, 0.922155456, 0.922727798, 0.923296282, 0.923860929, 0.92442176, 0.924978795,
0.925532055, 0.926081561, 0.926627334, 0.927169394, 0.927707762, 0.928242458, 0.928773503,
0.929300917, 0.929824721, 0.930344935, 0.93086158, 0.931374675, 0.931884241, 0.932390297,
0.932892865, 0.933391964, 0.933887615, 0.934379836, 0.934868648, 0.93535407, 0.935836124,
0.936314827, 0.9367902, 0.937262263, 0.937731034, 0.938196534, 0.938658781, 0.939117796,
0.939573597, 0.940026203, 0.940475634, 0.940921909, 0.941365046, 0.941805065, 0.942241985,
0.942675824, 0.943106601, 0.943534335, 0.943959044, 0.944380747, 0.944799462, 0.945215208,
0.945628003, 0.946037865, 0.946444813, 0.946848864, 0.947250036, 0.947648348, 0.948043817,
0.948436462, 0.948826299, 0.949213347, 0.949597623, 0.949979144, 0.950357929, 0.950733994,
0.951107357, 0.951478034, 0.951846044, 0.952211402, 0.952574127, 0.952934234, 0.953291742,
0.953646665, 0.953999022, 0.954348829, 0.954696102, 0.955040858, 0.955383113, 0.955722883,
0.956060185, 0.956395034, 0.956727447, 0.95705744, 0.957385028, 0.957710228, 0.958033055,
0.958353525, 0.958671653, 0.958987455, 0.959300946, 0.959612142, 0.959921058, 0.960227709,
0.960532111, 0.960834277, 0.961134224, 0.961431966, 0.961727518, 0.962020894, 0.962312109,
0.962601179, 0.962888117, 0.963172937, 0.963455655, 0.963736284, 0.964014838, 0.964291332,
0.96456578, 0.964838195, 0.965108591, 0.965376983, 0.965643384, 0.965907808, 0.966170267,
0.966430777, 0.966689349, 0.966945998, 0.967200737, 0.967453578, 0.967704535, 0.967953622,
0.96820085, 0.968446233, 0.968689784, 0.968931516, 0.96917144, 0.969409571, 0.969645919,
0.969880498, 0.97011332, 0.970344398, 0.970573743, 0.970801367, 0.971027284, 0.971251504,
0.97147404, 0.971694904, 0.971914107, 0.972131661, 0.972347578, 0.972561869, 0.972774546,
0.97298562, 0.973195103, 0.973403006, 0.973609341, 0.973814117, 0.974017347, 0.974219042,
0.974419212, 0.974617868, 0.974815021, 0.975010683, 0.975204863, 0.975397572, 0.97558882,
0.975778619, 0.975966979, 0.97615391, 0.976339422, 0.976523525, 0.97670623, 0.976887547,
0.977067486, 0.977246057, 0.977423269, 0.977599132, 0.977773657, 0.977946853, 0.978118729,
0.978289296, 0.978458562, 0.978626537, 0.978793231, 0.978958653, 0.979122812, 0.979285717,
0.979447378, 0.979607804, 0.979767003, 0.979924985, 0.980081758, 0.980237332, 0.980391715,
0.980544915, 0.980696943, 0.980847805, 0.980997512, 0.981146071, 0.98129349, 0.981439779,
0.981584945, 0.981728996, 0.981871942, 0.98201379, 0.982154548, 0.982294225, 0.982432827,
0.982570364, 0.982706843, 0.982842273, 0.982976659, 0.983110012, 0.983242337, 0.983373644,
0.983503939, 0.983633229, 0.983761524, 0.983888829, 0.984015152, 0.9841405, 0.984264882,
0.984388303, 0.984510772, 0.984632294, 0.984752879, 0.984872531, 0.984991259, 0.985109069,
0.985225968, 0.985341963, 0.985457061, 0.985571269, 0.985684592, 0.985797039, 0.985908614,
0.986019326, 0.98612918, 0.986238183, 0.986346341, 0.986453661, 0.986560148, 0.98666581,
0.986770653, 0.986874682, 0.986977903, 0.987080324, 0.98718195, 0.987282786, 0.987382839,
0.987482115, 0.98758062, 0.98767836, 0.987775339, 0.987871565, 0.987967043, 0.988061778,
0.988155776, 0.988249042, 0.988341583, 0.988433404, 0.98852451, 0.988614907, 0.9887046,
0.988793594, 0.988881895, 0.988969507, 0.989056437, 0.98914269, 0.98922827, 0.989313183,
0.989397433, 0.989481027, 0.989563968, 0.989646262, 0.989727914, 0.989808929, 0.989889312,
0.989969066, 0.990048198, 0.990126712, 0.990204613, 0.990281905, 0.990358593, 0.990434681,
0.990510175, 0.990585079, 0.990659397, 0.990733134, 0.990806295, 0.990878883, 0.990950903,
0.99102236, 0.991093257, 0.9911636, 0.991233391, 0.991302637, 0.99137134, 0.991439506,
0.991507137, 0.991574239, 0.991640815, 0.991706869, 0.991772406, 0.991837429, 0.991901942,
0.99196595, 0.992029456, 0.992092463, 0.992154977, 0.992217, 0.992278537, 0.992339591,
0.992400166, 0.992460265, 0.992519893, 0.992579053, 0.992637749, 0.992695983, 0.99275376,
0.992811084, 0.992867957, 0.992924384, 0.992980367, 0.993035911, 0.993091018, 0.993145692,
0.993199936, 0.993253754, 0.993307149, 0.993360124, 0.993412683, 0.993464828, 0.993516563,
0.993567892, 0.993618816, 0.99366934, 0.993719466, 0.993769198, 0.993818539, 0.993867491,
0.993916059, 0.993964243, 0.994012049, 0.994059478, 0.994106533, 0.994153219, 0.994199536,
0.994245489, 0.994291079, 0.994336311, 0.994381186, 0.994425708, 0.994469878, 0.994513701,
0.994557178, 0.994600313, 0.994643108, 0.994685565, 0.994727688, 0.994769478, 0.994810939,
0.994852073, 0.994892883, 0.994933371, 0.994973539, 0.995013391, 0.995052928, 0.995092153,
0.995131069, 0.995169677, 0.995207981, 0.995245983, 0.995283685, 0.995321089, 0.995358198,
0.995395014, 0.995431539, 0.995467776, 0.995503727, 0.995539394, 0.995574779, 0.995609885,
0.995644713, 0.995679266, 0.995713547, 0.995747556, 0.995781297, 0.995814772, 0.995847981,
0.995880929, 0.995913616, 0.995946044, 0.995978217, 0.996010135, 0.996041801, 0.996073216,
0.996104383, 0.996135304, 0.99616598, 0.996196413, 0.996226606, 0.996256561, 0.996286278,
0.99631576, 0.996345009, 0.996374027, 0.996402815, 0.996431375, 0.99645971, 0.99648782,
0.996515708, 0.996543375, 0.996570823, 0.996598054, 0.99662507, 0.996651872, 0.996678461,
0.99670484, 0.99673101, 0.996756974, 0.996782731, 0.996808285, 0.996833636, 0.996858787,
0.996883738, 0.996908492, 0.99693305, 0.996957413, 0.996981584, 0.997005563, 0.997029352,
0.997052952, 0.997076366, 0.997099594, 0.997122638, 0.9971455, 0.99716818, 0.997190681,
0.997213004, 0.997235149, 0.99725712, 0.997278916, 0.997300539, 0.997321991, 0.997343273,
0.997364386, 0.997385332, 0.997406112, 0.997426727, 0.997447179, 0.997467468, 0.997487597,
0.997507566, 0.997527377, 0.997547031, 0.997566528, 0.997585872, 0.997605062, 0.997624099,
0.997642986, 0.997661723, 0.997680312, 0.997698752, 0.997717047, 0.997735197, 0.997753202,
0.997771065, 0.997788786, 0.997806367, 0.997823808, 0.99784111, 0.997858276, 0.997875305,
0.997892199, 0.997908959, 0.997925586, 0.997942081, 0.997958445, 0.99797468, 0.997990785,
0.998006763, 0.998022614, 0.998038339, 0.998053939, 0.998069415, 0.998084769, 0.998100001,
0.998115112, 0.998130102, 0.998144974, 0.998159728, 0.998174365, 0.998188885, 0.99820329,
0.998217581, 0.998231759, 0.998245823, 0.998259777, 0.998273619, 0.998287351, 0.998300975,
0.99831449, 0.998327898, 0.998341199, 0.998354395, 0.998367486, 0.998380473, 0.998393356,
0.998406138, 0.998418818, 0.998431397, 0.998443876, 0.998456256, 0.998468538, 0.998480723,
0.99849281, 0.998504802, 0.998516698, 0.998528499, 0.998540207, 0.998551822, 0.998563345,
0.998574776, 0.998586116, 0.998597366, 0.998608527, 0.998619599, 0.998630583, 0.99864148,
0.99865229, 0.998663015, 0.998673654, 0.998684208, 0.998694679, 0.998705066, 0.998715371,
0.998725594, 0.998735736, 0.998745797, 0.998755778, 0.99876568, 0.998775503, 0.998785248,
0.998794916, 0.998804507, 0.998814021, 0.99882346, 0.998832824, 0.998842113, 0.998851329,
0.998860471, 0.998869541, 0.998878538, 0.998887464, 0.998896319, 0.998905104, 0.998913818,
0.998922464, 0.99893104, 0.998939549, 0.99894799, 0.998956364, 0.998964671, 0.998972912,
0.998981088, 0.998989198, 0.998997244, 0.999005226, 0.999013145, 0.999021001, 0.999028794,
0.999036525, 0.999044195, 0.999051803, 0.999059352, 0.99906684, 0.999074268, 0.999081638,
0.999088949, 0.999096202, 0.999103397, 0.999110535, 0.999117616, 0.99912464, 0.999131609,
0.999138523, 0.999145381, 0.999152185, 0.999158935, 0.999165631, 0.999172274, 0.999178864,
0.999185401, 0.999191887, 0.999198321, 0.999204704, 0.999211036, 0.999217317, 0.999223549,
0.999229731, 0.999235864, 0.999241948, 0.999247984, 0.999253971, 0.999259911, 0.999265804,
0.99927165, 0.999277449, 0.999283202, 0.99928891, 0.999294572, 0.999300189, 0.999305761,
0.999311289, 0.999316773, 0.999322213, 0.99932761, 0.999332964, 0.999338276, 0.999343545,
0.999348772, 0.999353958, 0.999359103, 0.999364206, 0.999369269, 0.999374291, 0.999379274,
0.999384217, 0.999389121, 0.999393985, 0.999398811, 0.999403599, 0.999408348, 0.99941306,
0.999417734, 0.99942237, 0.99942697, 0.999431534, 0.999436061, 0.999440552, 0.999445007,
0.999449427, 0.999453811, 0.999458161, 0.999462476, 0.999466757, 0.999471004, 0.999475217,
0.999479396, 0.999483542, 0.999487655, 0.999491735, 0.999495783, 0.999499799, 0.999503783,
0.999507735, 0.999511655, 0.999515544, 0.999519403, 0.99952323, 0.999527027, 0.999530794,
0.999534531, 0.999538238, 0.999541916, 0.999545564, 0.999549184, 0.999552774, 0.999556336,
0.99955987, 0.999563375, 0.999566853, 0.999570303, 0.999573725, 0.99957712, 0.999580488,
0.99958383, 0.999587145, 0.999590433, 0.999593695, 0.999596931, 0.999600142, 0.999603326,
0.999606486, 0.99960962, 0.99961273, 0.999615814, 0.999618874, 0.99962191, 0.999624921,
0.999627909, 0.999630873, 0.999633813, 0.99963673, 0.999639623, 0.999642494, 0.999645341,
0.999648166, 0.999650969, 0.999653749, 0.999656507, 0.999659243, 0.999661957, 0.498000011,
0.496000085, 0.494000288, 0.492000683, 0.490001333, 0.488002303, 0.486003658, 0.484005459,
0.482007772, 0.48001066, 0.478014186, 0.476018415, 0.474023409, 0.472029233, 0.470035948,
0.468043619, 0.466052309, 0.464062079, 0.462072994, 0.460085115, 0.458098506, 0.456113228,
0.454129343, 0.452146914, 0.450166003, 0.44818667, 0.446208977, 0.444232986, 0.442258757,
0.440286351, 0.438315828, 0.43634725, 0.434380675, 0.432416164, 0.430453776, 0.428493571,
0.426535606, 0.424579942, 0.422626637, 0.420675748, 0.418727333, 0.416781451, 0.414838158,
0.41289751, 0.410959566, 0.40902438, 0.40709201, 0.405162509, 0.403235934, 0.40131234,
0.39939178, 0.39747431, 0.395559983, 0.393648851, 0.391740969, 0.389836389, 0.387935163,
0.386037343, 0.38414298, 0.382252125, 0.38036483, 0.378481143, 0.376601115, 0.374724796,
0.372852234, 0.370983477, 0.369118574, 0.367257572, 0.365400518, 0.36354746, 0.361698442,
0.359853512, 0.358012714, 0.356176093, 0.354343694, 0.35251556, 0.350691735, 0.348872261,
0.347057182, 0.345246539, 0.343440374, 0.341638728, 0.33984164, 0.338049152, 0.336261303,
0.334478131, 0.332699675, 0.330925974, 0.329157064, 0.327392983, 0.325633767, 0.323879452,
0.322130074, 0.320385667, 0.318646266, 0.316911905, 0.315182617, 0.313458435, 0.311739392,
0.310025519, 0.308316847, 0.306613408, 0.304915231, 0.303222347, 0.301534784, 0.299852571,
0.298175737, 0.296504309, 0.294838314, 0.293177779, 0.29152273, 0.289873192, 0.28822919,
0.28659075, 0.284957894, 0.283330647, 0.281709032, 0.28009307, 0.278482784, 0.276878195,
0.275279324, 0.273686192, 0.272098818, 0.270517221, 0.268941421, 0.267371436, 0.265807284,
0.264248982, 0.262696546, 0.261149994, 0.259609341, 0.258074602, 0.256545792, 0.255022926,
0.253506017, 0.251995078, 0.250490124, 0.248991165, 0.247498215, 0.246011284, 0.244530383,
0.243055523, 0.241586713, 0.240123965, 0.238667285, 0.237216684, 0.235772169, 0.234333748,
0.232901428, 0.231475217, 0.230055119, 0.228641142, 0.227233291, 0.225831571, 0.224435986,
0.22304654, 0.221663238, 0.220286083, 0.218915077, 0.217550224, 0.216191524, 0.21483898,
0.213492593, 0.212152364, 0.210818293, 0.209490381, 0.208168627, 0.20685303, 0.205543589,
0.204240302, 0.202943169, 0.201652186, 0.20036735, 0.19908866, 0.197816111, 0.196549701,
0.195289423, 0.194035276, 0.192787252, 0.191545349, 0.190309559, 0.189079877, 0.187856298,
0.186638814, 0.185427419, 0.184222106, 0.183022868, 0.181829696, 0.180642582, 0.179461519,
0.178286498, 0.17711751, 0.175954545, 0.174797594, 0.173646647, 0.172501694, 0.171362726,
0.17022973, 0.169102697, 0.167981615, 0.166866472, 0.165757258, 0.16465396, 0.163556565,
0.162465063, 0.161379439, 0.16029968, 0.159225775, 0.158157709, 0.157095469, 0.156039041,
0.15498841, 0.153943564, 0.152904486, 0.151871164, 0.15084358, 0.149821722, 0.148805573,
0.147795117, 0.14679034, 0.145791225, 0.144797756, 0.143809918, 0.142827693, 0.141851065,
0.140880018, 0.139914534, 0.138954597, 0.138000189, 0.137051293, 0.136107891, 0.135169966,
0.1342375, 0.133310475, 0.132388874, 0.131472676, 0.130561866, 0.129656423, 0.128756329,
0.127861566, 0.126972115, 0.126087957, 0.125209072, 0.124335442, 0.123467048, 0.122603869,
0.121745886, 0.120893081, 0.120045433, 0.119202922, 0.118365529, 0.117533233, 0.116706015,
0.115883855, 0.115066732, 0.114254626, 0.113447517, 0.112645385, 0.111848208, 0.111055967,
0.11026864, 0.109486208, 0.10870865, 0.107935944, 0.10716807, 0.106405008, 0.105646736,
0.104893233, 0.104144479, 0.103400451, 0.102661131, 0.101926495, 0.101196524, 0.100471196,
0.099750489, 0.099034383, 0.098322857, 0.097615889, 0.096913457, 0.096215542, 0.09552212,
0.094833172, 0.094148676, 0.09346861, 0.092792953, 0.092121684, 0.091454782, 0.090792224,
0.09013399, 0.089480059, 0.088830409, 0.088185019, 0.087543867, 0.086906933, 0.086274194,
0.085645631, 0.08502122, 0.084400942, 0.083784774, 0.083172696, 0.082564687, 0.081960725,
0.08136079, 0.080764859, 0.080172912, 0.079584928, 0.079000886, 0.078420765, 0.077844544,
0.077272202, 0.076703718, 0.076139071, 0.07557824, 0.075021205, 0.074467945, 0.073918439,
0.073372666, 0.072830606, 0.072292238, 0.071757542, 0.071226497, 0.070699083, 0.070175279,
0.069655065, 0.06913842, 0.068625325, 0.068115759, 0.067609703, 0.067107135, 0.066608036,
0.066112385, 0.065620164, 0.065131352, 0.06464593, 0.064163876, 0.063685173, 0.0632098,
0.062737737, 0.062268966, 0.061803466, 0.061341219, 0.060882204, 0.060426403, 0.059973797,
0.059524366, 0.059078091, 0.058634954, 0.058194935, 0.057758015, 0.057324176, 0.056893399,
0.056465665, 0.056040956, 0.055619253, 0.055200538, 0.054784792, 0.054371997, 0.053962135,
0.053555187, 0.053151136, 0.052749964, 0.052351652, 0.051956183, 0.051563538, 0.051173701,
0.050786653, 0.050402377, 0.050020856, 0.049642071, 0.049266006, 0.048892643, 0.048521966,
0.048153956, 0.047788598, 0.047425873, 0.047065766, 0.046708258, 0.046353335, 0.046000978,
0.045651171, 0.045303898, 0.044959142, 0.044616887, 0.044277117, 0.043939815, 0.043604966,
0.043272553, 0.04294256, 0.042614972, 0.042289772, 0.041966945, 0.041646475, 0.041328347,
0.041012545, 0.040699054, 0.040387858, 0.040078942, 0.039772291, 0.039467889, 0.039165723,
0.038865776, 0.038568034, 0.038272482, 0.037979106, 0.037687891, 0.037398821, 0.037111883,
0.036827063, 0.036544345, 0.036263716, 0.035985162, 0.035708668, 0.03543422, 0.035161805,
0.034891409, 0.034623017, 0.034356616, 0.034092192, 0.033829733, 0.033569223, 0.033310651,
0.033054002, 0.032799263, 0.032546422, 0.032295465, 0.032046378, 0.03179915, 0.031553767,
0.031310216, 0.031068484, 0.03082856, 0.030590429, 0.030354081, 0.030119502, 0.02988668,
0.029655602, 0.029426257, 0.029198633, 0.028972716, 0.028748496, 0.02852596, 0.028305096,
0.028085893, 0.027868339, 0.027652422, 0.027438131, 0.027225454, 0.02701438, 0.026804897,
0.026596994, 0.026390659, 0.026185883, 0.025982653, 0.025780958, 0.025580788, 0.025382132,
0.025184979, 0.024989317, 0.024795137, 0.024602428, 0.02441118, 0.024221381, 0.024033021,
0.02384609, 0.023660578, 0.023476475, 0.02329377, 0.023112453, 0.022932514, 0.022753943,
0.022576731, 0.022400868, 0.022226343, 0.022053147, 0.021881271, 0.021710704, 0.021541438,
0.021373463, 0.021206769, 0.021041347, 0.020877188, 0.020714283, 0.020552622, 0.020392196,
0.020232997, 0.020075015, 0.019918242, 0.019762668, 0.019608285, 0.019455085, 0.019303057,
0.019152195, 0.019002488, 0.018853929, 0.01870651, 0.018560221, 0.018415055, 0.018271004,
0.018128058, 0.01798621, 0.017845452, 0.017705775, 0.017567173, 0.017429636, 0.017293157,
0.017157727, 0.017023341, 0.016889988, 0.016757663, 0.016626356, 0.016496061, 0.016366771,
0.016238476, 0.016111171, 0.015984848, 0.0158595, 0.015735118, 0.015611697, 0.015489228,
0.015367706, 0.015247121, 0.015127469, 0.015008741, 0.014890931, 0.014774032, 0.014658037,
0.014542939, 0.014428731, 0.014315408, 0.014202961, 0.014091386, 0.013980674, 0.01387082,
0.013761817, 0.013653659, 0.013546339, 0.013439852, 0.01333419, 0.013229347, 0.013125318,
0.013022097, 0.012919676, 0.01281805, 0.012717214, 0.012617161, 0.012517885, 0.01241938,
0.01232164, 0.012224661, 0.012128435, 0.012032957, 0.011938222, 0.011844224, 0.011750958,
0.011658417, 0.011566596, 0.01147549, 0.011385093, 0.0112954, 0.011206406, 0.011118105,
0.011030493, 0.010943563, 0.01085731, 0.01077173, 0.010686817, 0.010602567, 0.010518973,
0.010436032, 0.010353738, 0.010272086, 0.010191071, 0.010110688, 0.010030934, 0.009951802,
0.009873288, 0.009795387, 0.009718095, 0.009641407, 0.009565319, 0.009489825, 0.009414921,
0.009340603, 0.009266866, 0.009193705, 0.009121117, 0.009049097, 0.00897764, 0.008906743,
0.0088364, 0.008766609, 0.008697363, 0.00862866, 0.008560494, 0.008492863, 0.008425761,
0.008359185, 0.008293131, 0.008227594, 0.008162571, 0.008098058, 0.00803405, 0.007970544,
0.007907537, 0.007845023, 0.007783, 0.007721463, 0.007660409, 0.007599834, 0.007539735,
0.007480107, 0.007420947, 0.007362251, 0.007304017, 0.00724624, 0.007188916, 0.007132043,
0.007075616, 0.007019633, 0.006964089, 0.006908982, 0.006854308, 0.006800064, 0.006746246,
0.006692851, 0.006639876, 0.006587317, 0.006535172, 0.006483437, 0.006432108, 0.006381184,
0.00633066, 0.006280534, 0.006230802, 0.006181461, 0.006132509, 0.006083941, 0.006035757,
0.005987951, 0.005940522, 0.005893467, 0.005846781, 0.005800464, 0.005754511, 0.005708921,
0.005663689, 0.005618814, 0.005574292, 0.005530122, 0.005486299, 0.005442822, 0.005399687,
0.005356892, 0.005314435, 0.005272312, 0.005230522, 0.005189061, 0.005147927, 0.005107117,
0.005066629, 0.005026461, 0.004986609, 0.004947072, 0.004907847, 0.004868931, 0.004830323,
0.004792019, 0.004754017, 0.004716315, 0.004678911, 0.004641802, 0.004604986, 0.004568461,
0.004532224, 0.004496273, 0.004460606, 0.004425221, 0.004390115, 0.004355287, 0.004320734,
0.004286453, 0.004252444, 0.004218703, 0.004185228, 0.004152019, 0.004119071, 0.004086384,
0.004053956, 0.004021783, 0.003989865, 0.003958199, 0.003926784, 0.003895617, 0.003864696,
0.00383402, 0.003803587, 0.003773394, 0.003743439, 0.003713722, 0.00368424, 0.003654991,
0.003625973, 0.003597185, 0.003568625, 0.00354029, 0.00351218, 0.003484292, 0.003456625,
0.003429177, 0.003401946, 0.00337493, 0.003348128, 0.003321539, 0.00329516, 0.00326899,
0.003243026, 0.003217269, 0.003191715, 0.003166364, 0.003141213, 0.003116262, 0.003091508,
0.00306695, 0.003042587, 0.003018416, 0.002994437, 0.002970648, 0.002947048, 0.002923634,
0.002900406, 0.002877362, 0.0028545, 0.00283182, 0.002809319, 0.002786996, 0.002764851,
0.00274288, 0.002721084, 0.002699461, 0.002678009, 0.002656727, 0.002635614, 0.002614668,
0.002593888, 0.002573273, 0.002552821, 0.002532532, 0.002512403, 0.002492434, 0.002472623,
0.002452969, 0.002433472, 0.002414128, 0.002394938, 0.002375901, 0.002357014, 0.002338277,
0.002319688, 0.002301248, 0.002282953, 0.002264803, 0.002246798, 0.002228935, 0.002211214,
0.002193633, 0.002176192, 0.00215889, 0.002141724, 0.002124695, 0.002107801, 0.002091041,
0.002074414, 0.002057919, 0.002041555, 0.00202532, 0.002009215, 0.001993237, 0.001977386,
0.001961661, 0.001946061, 0.001930585, 0.001915231, 0.001899999, 0.001884888, 0.001869898,
0.001855026, 0.001840272, 0.001825635, 0.001811115, 0.00179671, 0.001782419, 0.001768241,
0.001754177, 0.001740223, 0.001726381, 0.001712649, 0.001699025, 0.00168551, 0.001672102,
0.001658801, 0.001645605, 0.001632514, 0.001619527, 0.001606644, 0.001593862, 0.001581182,
0.001568603, 0.001556124, 0.001543744, 0.001531462, 0.001519277, 0.00150719, 0.001495198,
0.001483302, 0.001471501, 0.001459793, 0.001448178, 0.001436655, 0.001425224, 0.001413884,
0.001402634, 0.001391473, 0.001380401, 0.001369417, 0.00135852, 0.00134771, 0.001336985,
0.001326346, 0.001315792, 0.001305321, 0.001294934, 0.001284629, 0.001274406, 0.001264264,
0.001254203, 0.001244222, 0.00123432, 0.001224497, 0.001214752, 0.001205084, 0.001195493,
0.001185979, 0.00117654, 0.001167176, 0.001157887, 0.001148671, 0.001139529, 0.001130459,
0.001121462, 0.001112536, 0.001103681, 0.001094896, 0.001086182, 0.001077536, 0.00106896,
0.001060451, 0.00105201, 0.001043636, 0.001035329, 0.001027088, 0.001018912, 0.001010802,
0.001002756, 0.000994774, 0.000986855, 0.000978999, 0.000971206, 0.000963475, 0.000955805,
0.000948197, 0.000940648, 0.00093316, 0.000925732, 0.000918362, 0.000911051, 0.000903798,
0.000896603, 0.000889465, 0.000882384, 0.00087536, 0.000868391, 0.000861477, 0.000854619,
0.000847815, 0.000841065, 0.000834369, 0.000827726, 0.000821136, 0.000814599, 0.000808113,
0.000801679, 0.000795296, 0.000788964, 0.000782683, 0.000776451, 0.000770269, 0.000764136,
0.000758052, 0.000752016, 0.000746029, 0.000740089, 0.000734196, 0.00072835, 0.000722551,
0.000716798, 0.00071109, 0.000705428, 0.000699811, 0.000694239, 0.000688711, 0.000683227,
0.000677787, 0.00067239, 0.000667036, 0.000661724, 0.000656455, 0.000651228, 0.000646042,
0.000640897, 0.000635794, 0.000630731, 0.000625709, 0.000620726, 0.000615783, 0.000610879,
0.000606015, 0.000601189, 0.000596401, 0.000591652, 0.00058694, 0.000582266, 0.00057763,
0.00057303, 0.000568466, 0.000563939, 0.000559448, 0.000554993, 0.000550573, 0.000546189,
0.000541839, 0.000537524, 0.000533243, 0.000528996, 0.000524783, 0.000520604, 0.000516458,
0.000512345, 0.000508265, 0.000504217, 0.000500201, 0.000496217, 0.000492265, 0.000488345,
0.000484456, 0.000480597, 0.00047677, 0.000472973, 0.000469206, 0.000465469, 0.000461762,
0.000458084, 0.000454436, 0.000450816, 0.000447226, 0.000443664, 0.00044013, 0.000436625,
0.000433147, 0.000429697, 0.000426275, 0.00042288, 0.000419512, 0.00041617, 0.000412855,
0.000409567, 0.000406305, 0.000403069, 0.000399858, 0.000396674, 0.000393514, 0.00039038,
0.00038727, 0.000384186, 0.000381126, 0.00037809, 0.000375079, 0.000372091, 0.000369127,
0.000366187, 0.00036327, 0.000360377, 0.000357506, 0.000354659, 0.000351834, 0.000349031,
0.000346251, 0.000343493, 0.000340757, 0.000338043, 0.00033535};
// static bool check_input_int8_range(float input)
//{
// bool ret = input > -128.0 && input < 128.0;
// if (!ret) {
// printf("invalid int8 range, input is %f\n", input);
// }
// return ret;
//}
/**
 * Reference logistic sigmoid, 1 / (1 + e^-x), used to build golden values.
 *
 * @param x  input value
 * @return   sigmoid of x as a double
 */
static double _gen_sigmoid(float x) {
  const double denominator = 1.0 + exp(-x);
  return 1.0 / denominator;
}
/**
 * Produce the reference ("golden") sigmoid output for every element of ifmap.
 *
 * Default build: each bf16 input is widened to fp32, passed through
 * _gen_sigmoid() and narrowed back to bf16.
 * GDB build: the inputs are dumped as fp32 to INFP32FILE, `eval_lut.py` is run
 * through system(), and the golden bf16 values are read back from OUTBF16FILE;
 * every input/golden pair is then printed.
 *
 * @param ofmap        destination buffer, one bf16 value per input element
 * @param ifmap        bf16 input values
 * @param table        LUT data; only checked for non-NULL here
 * @param table_slope  LUT slope data; only checked for non-NULL here
 * @param ifmap_shape  shape of ifmap; its element count drives every loop
 * @param table_shape  shape of the LUT; asserted to have n == 1 and h * w == 256
 * @param range_start  LUT input range start (forwarded to eval_lut.py in GDB builds)
 * @param range_end    LUT input range end (forwarded to eval_lut.py in GDB builds)
 */
static void tl_lut_ref(uint16_t *ofmap, uint16_t *ifmap, uint16_t *table, uint16_t *table_slope,
                       cvk_tl_shape_t ifmap_shape, cvk_tl_shape_t table_shape, int range_start,
                       int range_end) {
  int tn, th, tw;
  tn = table_shape.n;
  th = table_shape.h;
  tw = table_shape.w;

  // The (void) casts keep builds where assert() compiles to nothing free of
  // unused-variable / unused-parameter warnings.
  (void)tn;
  (void)th;
  (void)tw;
  (void)table;
  (void)table_slope;
  (void)range_start;
  (void)range_end;

  assert(tn == 1);
  assert(th * tw == 256);
  assert(table);
  assert(table_slope);
  assert(ifmap_shape.n);
  assert(ifmap);
  assert(ofmap);

  // Element count is loop-invariant, so compute it once.
  const uint64_t elem_count = tl_shape_size(&ifmap_shape);

  // TODO: use c function
  // 1. dump all input as binary file
#ifdef GDB
#define INFP32FILE "infp32file.bin"
#define OUTBF16FILE "lutbf16out.bin"
  FILE *pFile;
  pFile = fopen(INFP32FILE, "wb");
  if (!pFile) {
    // Without this check fwrite()/fclose() below would be handed a NULL stream.
    fprintf(stderr, "open %s for write fail\n", INFP32FILE);
    exit(-1);
  }

  float *f = new float[elem_count];
  for (uint64_t i = 0; i < elem_count; i++) {
    f[i] = convert_bf16_fp32(ifmap[i]);
  }
  fwrite(f, 1, elem_count * sizeof(float), pFile);
  fclose(pFile);
  delete[] f;  // staging buffer is only needed for the dump

  // 2. read result from `eval_lut.py`
  char command[256];
  // snprintf bounds the write to the fixed-size command buffer.
  // NOTE(review): "2>&1 > 2" sends stdout to a file literally named "2" -
  // confirm that this redirect is intended.
  snprintf(command, sizeof(command),
           "python eval_lut.py --lut_input_range_start %d --lut_input_range_end "
           "%d --inputfloat32 %s --outputbf16 %s 2>&1 > 2\n",
           range_start, range_end, INFP32FILE, OUTBF16FILE);

  int r;
  r = system(command);
  printf("command is %s, return %d\n", command, r);
  // NOTE(review): this requires a NON-zero status from system(); a successful
  // command normally yields 0, so the condition looks inverted - confirm
  // against eval_lut.py before changing it.
  assert(r != 0);

  pFile = fopen(OUTBF16FILE, "rb");
  if (!pFile) {
    fprintf(stderr, "open golden %s fail\n", OUTBF16FILE);
    exit(-1);
  }

  size_t file_length;
  file_length = fread(ofmap, sizeof(uint16_t), elem_count, pFile);
  printf("read from golden, file size %lu\n", file_length);
  fclose(pFile);
#else
  assert(range_start);
  assert(range_end);

  for (uint64_t i = 0; i < elem_count; i++) {
    ofmap[i] = convert_fp32_bf16(_gen_sigmoid(convert_bf16_fp32(ifmap[i])));
  }
#endif

#ifdef GDB
  for (uint64_t i = 0; i < elem_count; i++) {
    printf("ref %lu input 0x%x(%f) golden 0x%x(%f)\n", i, ifmap[i], convert_bf16_fp32(ifmap[i]),
           ofmap[i], convert_bf16_fp32(ofmap[i]));
  }
#endif
}
/**
 * Check the device output against the expectation for the current test mode.
 *
 * - PRE_DATA_COMPARE_FIX: every output must be bit-identical to
 *   sigmode_golden_bf16; the first mismatch terminates the process.
 * - PRE_DATA_MAX_ERROR: outputs are compared, as fp32, against sigmode_golden
 *   with a tolerance of MAX_ERROR.
 * - any other mode: outputs are compared, as fp32, against ref_data with a
 *   tolerance of MAX_ERROR over all ofmap_size elements.
 *
 * In the tolerance modes every mismatch is reported and counted; the process
 * exits with -1 after the scan if at least one was found.
 *
 * @param ofmap_data  bf16 values produced by the device
 * @param ref_data    bf16 reference values
 * @param ofmap_size  number of elements in ofmap_data / ref_data
 * @return true when all compared elements pass (failures exit the process)
 */
static bool verify(uint16_t *ofmap_data, uint16_t *ref_data, uint64_t ofmap_size) {
  int count = 0;
  uint64_t size = ofmap_size;

  // The fixed-pattern modes compare exactly as many elements as their golden
  // table holds; every other mode compares the full output.
  if (mode == PRE_DATA_COMPARE_FIX) {
    size = sizeof(sigmode_golden_bf16) / sizeof(uint16_t);
  } else if (mode == PRE_DATA_MAX_ERROR) {
    // This used to test the enumerator itself rather than `mode`, which made
    // the branch apply to every mode other than PRE_DATA_COMPARE_FIX.
    size = sizeof(sigmode_golden) / sizeof(double);
  }

  for (uint64_t i = 0; i < size; i++) {
    if (mode == PRE_DATA_COMPARE_FIX) {
      if (ofmap_data[i] != sigmode_golden_bf16[i]) {
        fprintf(stderr, "[%d] comparing failed at ofmap_data[%lu], got %x, exp %x\n", count, i,
                ofmap_data[i], sigmode_golden_bf16[i]);
        exit(-1);
      }
    } else {
      float got = convert_bf16_fp32(ofmap_data[i]);
      float exp = convert_bf16_fp32(ref_data[i]);

      if (mode == PRE_DATA_MAX_ERROR) {
        // cus we have better accuracy ~ 0.0039
        exp = sigmode_golden[i];
      }

      if (fabs(got - exp) > MAX_ERROR) {
        // The hex "exp" field always shows ref_data[i]; the fp32 fields show
        // the value that was actually compared.
        fprintf(stderr,
                "[%d] comparing failed at ofmap_data[%lu], got %x, exp %x, "
                "diff(%f - %f) is %f\n",
                count, i, ofmap_data[i], ref_data[i], got, exp, fabs(got - exp));
        count++;
      }
    }
  }

  if (count != 0) {
    printf("error count is %d\n", count);
    exit(-1);
  }

  return true;
}
/**
 * Fill the bf16 input buffer for the current test mode.
 *
 * The two pre-data modes copy the fixed test_pattern table; every other mode
 * synthesizes a deterministic sequence of values of alternating sign.
 *
 * @param ifmap       destination buffer of bf16 values
 * @param ifmap_size  number of elements to generate (and to print in GDB builds)
 */
static void gen_input(uint16_t *ifmap, uint64_t ifmap_size) {
  const bool fixed_pattern = (mode == PRE_DATA_COMPARE_FIX || mode == PRE_DATA_MAX_ERROR);

  if (fixed_pattern) {
    memcpy(ifmap, &test_pattern, sizeof(test_pattern));
#ifdef GDB
    for (uint64_t idx = 0; idx < ifmap_size; idx++) {
      printf("source if[%lu] is bf16 %f (bf16)with 0x%x\n", idx, convert_bf16_fp32(ifmap[idx]),
             ifmap[idx]);
    }
#endif
    return;
  }

  const int table_hw = 256;
  for (uint64_t idx = 0; idx < ifmap_size; idx++) {
    // Integer part cycles through 0..6 with the sign flipping on odd/even
    // indices; the fractional offset keeps the value inside the -8 ~ +8 range.
    const int magnitude = (int)idx % 7;
    const int sign = ((int)idx % 2) ? 1 : -1;
    float input = magnitude * sign + 0.03 + (idx % table_hw) * 0.002;

    ifmap[idx] = convert_fp32_bf16(input);
#ifdef GDB
    printf("source if[%lu] is bf16 %f, input is %f (bf16)with 0x%x\n", idx,
           convert_bf16_fp32(ifmap[idx]), input, ifmap[idx]);
#endif
  }
}
/**
 * Run one sigmoid LUT test for the current global test mode: generate the
 * input and tables on the host, compute the reference, run cvm_emit_sigmoid
 * and verify the result that is read back.
 *
 * @param ctx  runtime handle passed to the tensor transfer helpers
 * @param bmk  kernel context used for local allocations and command emission
 */
static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) {
  // TODO: check more shape / align
  const bool fixed_pattern = (mode == PRE_DATA_COMPARE_FIX || mode == PRE_DATA_MAX_ERROR);
  cvk_tl_shape_t ifmap_shape;
  if (!fixed_pattern) {
    ifmap_shape = {1, 32, 16, 16};
  } else {
    ifmap_shape = {1, 32, 8, 8};
  }

  cvk_fmt_t fmt = CVK_FMT_BF16;

  // The LUT shape is provided by the library; the output mirrors the input.
  cvk_tl_shape_t table_shape;
  cvm_table_shape(bmk, &table_shape);
  cvk_tl_shape_t ofmap_shape = ifmap_shape;

  const uint64_t input_elems = tl_shape_size(&ifmap_shape);
  const uint64_t table_elems = tl_shape_size(&table_shape);
  const uint64_t output_elems = tl_shape_size(&ofmap_shape);
  const int elem_bytes = bytesize_of_fmt(fmt);

  // Host-side buffers.
  uint16_t *ifmap = (uint16_t *)xmalloc(input_elems * elem_bytes);
  uint16_t *table_data = (uint16_t *)xmalloc(table_elems * elem_bytes);
  uint16_t *table_data_slope = (uint16_t *)xmalloc(table_elems * elem_bytes);
  uint16_t *ref_data = (uint16_t *)xmalloc(output_elems * elem_bytes);

  // range depend on ur activation
  const int range_start = -8;
  const int range_end = 8;
  float scale = cvm_sigmoid_scale(range_start, range_end);

  // Populate the input, both tables and the golden output.
  gen_input(ifmap, input_elems);
  cvm_sigmoid_tbl(table_data, table_data_slope, &table_shape, range_start, range_end);
  tl_lut_ref(ref_data, ifmap, table_data, table_data_slope, ifmap_shape, table_shape, range_start,
             range_end);

  // Local tensors; they are released below in reverse allocation order.
  cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1);
  cvk_tl_t *cvk_tl_table_answer = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
  cvk_tl_t *cvk_tl_table_answer_slope = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
  cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1);
  cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1);

  // sys->local
  test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)ifmap);
  test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer, (uint8_t *)table_data);
  test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer_slope, (uint8_t *)table_data_slope);

  // emit core function
  cvm_emit_sigmoid(bmk, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_slope,
                   tl_ofmap_bf16, scale);

  // local->sys, then compare with the reference.
  uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, tl_ofmap_bf16);
  verify(ofmap_data, ref_data, output_elems);

  free_tl(bmk, tl_ofmap_bf16);
  free_tl(bmk, tl_buf);
  free_tl(bmk, cvk_tl_table_answer_slope);
  free_tl(bmk, cvk_tl_table_answer);
  free_tl(bmk, tl_ifmap);

  free(ifmap);
  free(table_data);
  free(table_data_slope);
  free(ref_data);
  free(ofmap_data);
}
/**
 * Test entry point: runs testbench() once for every TEST_MODE value from
 * PRE_DATA_COMPARE_FIX up to (not including) TEST_MODE_MAX, with the
 * floating-point rounding mode saved beforehand and restored afterwards.
 */
int main() {
  CVI_RT_HANDLE ctx;
  cvk_context_t *bmk;

  int round_mode;
  round_mode = set_store_feround();

  test_init(&ctx, &bmk);

  int mode_index = PRE_DATA_COMPARE_FIX;
  while (mode_index < TEST_MODE_MAX) {
    mode = static_cast<TEST_MODE>(mode_index);
    printf("test mode %d...\n", mode);
    testbench(&ctx, bmk);
    ++mode_index;
  }

  test_exit(&ctx, bmk);
  restore_feround(round_mode);
  return 0;
}

View File

@ -0,0 +1,375 @@
/**
*/
#include <cvimath_internal.h>
#include <test_cvikernel_util.h>
#include <cfloat>
#include <iomanip>
#include <iostream>
#include <map>
#include <random>
#include <string>
//#define DBG
using namespace std;
/**
 * Test modes of this test program.
 *
 * "pre_data" means a fixed input pattern is tested; its result should be the
 * same as the LUT output.
 */
enum TEST_MODE {
PRE_DATA_COMPARE_FIX = 0, // fixed input pattern (test_pattern), exact compare against test_pattern_ref
GEN_POW_20_DATA_MAX_ERROR, // generated random input, compared within an epsilon
// NOTE(review): the original comment said 2^-20 ~ 2^20, but gen_input
// draws from [2^-10, 2^10] - confirm which range is intended.
TEST_MODE_MAX,
};
// Mode of the test currently running; read by gen_input() and tl_lut_ref().
static TEST_MODE mode;
static uint16_t test_pattern[] = {
0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52, 0x3A6C, 0x3A83, 0x3A90,
0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF, 0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17,
0x3B1D, 0x3B24, 0x3B2A, 0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65,
0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90, 0x3B93, 0x3B97, 0x3B9A,
0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE, 0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1,
0x3BC5, 0x3BC8, 0x3BCB, 0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9,
0x3BEC, 0x3BEF, 0x3BF2, 0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03, 0x3C05, 0x3C06, 0x3C08,
0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12, 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C,
0x3C1D, 0x3C1F, 0x3C21, 0x3C22, 0x3C24, 0x3C25, 0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F,
0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E, 0x3C40, 0x3C41, 0x3C43,
0x3C45, 0x3C46, 0x3C48, 0x3C4A, 0x3C4B, 0x3C4D, 0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57,
0x3C58, 0x3C5A, 0x3C5C, 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A,
0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, 0x3C7B, 0x3C7C, 0x3C7E,
0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89,
0x3C8A, 0x3C8A, 0x3C8B, 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93,
0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, 0x3C9B, 0x3C9C, 0x3C9C,
0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6,
0x3CA7, 0x3CA8, 0x3CA9, 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0,
0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, 0x3CB8, 0x3CB9, 0x3CBA,
0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4,
0x3CC5, 0x3CC5, 0x3CC6, 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE,
0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, 0x3CD6, 0x3CD7, 0x3CD7,
0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1,
0x3CE2, 0x3CE3, 0x3CE4, 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB,
0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, 0x3CF3, 0x3CF4, 0x3CF5,
0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF,
0x3D00, 0x3D00, 0x3D01, 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 0x3D03, 0x3D03, 0x3D04, 0x3D04,
0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, 0x3D08, 0x3D09, 0x3D09,
0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 0x3D0E,
0x3D0F, 0x3D0F, 0x3D0F, 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13,
0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 0x3D16, 0x3D17, 0x3D17, 0x3D18, 0x3D18,
0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D,
0x3D1D, 0x3D1E, 0x3D1E, 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22,
0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, 0x3D26, 0x3D26, 0x3D27,
0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C,
0x3D2C, 0x3D2C, 0x3D2D, 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31,
0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 0x3D33, 0x3D33, 0x3D34, 0x3D34, 0x3D35, 0x3D35, 0x3D35,
0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A,
0x3D3B, 0x3D3B, 0x3D3C, 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F,
0x3D40, 0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, 0x3D43, 0x3D44, 0x3D44,
0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49,
0x3D4A, 0x3D4A, 0x3D4A, 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E,
0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, 0x3D52, 0x3D53, 0x3D53,
0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58,
0x3D58, 0x3D59, 0x3D59, 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D,
0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, 0x3D61, 0x3D61, 0x3D62,
0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67,
0x3D67, 0x3D67, 0x3D68, 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C,
0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, 0x3D70, 0x3D70, 0x3D70,
0x3D71, 0x3D71, 0x3D72, 0x3D72, 0x3D72, 0x3D73, 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75,
0x3D76, 0x3D76, 0x3D77, 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A,
0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, 0x3D7E, 0x3D7F, 0x3D7F,
0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82,
0x3D82, 0x3D82, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85,
0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D87, 0x3D87, 0x3D87,
0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89,
0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8C, 0x3D8C,
0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E,
0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91,
0x3D91, 0x3D91, 0x3D91, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93,
0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, 0x3D95, 0x3D96, 0x3D96,
0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98,
0x3D98, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B,
0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9D, 0x3D9D, 0x3D9D,
0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0,
0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2,
0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4,
0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7,
0x3DA7, 0x3DA7, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9,
0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAC, 0x3DAC,
0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE,
0x3DAE, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1,
0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3,
0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6,
0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8,
0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBB,
0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD,
0x3DBD, 0x3DBD, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF,
0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC2, 0x3DC2, 0x3DC2,
0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4,
0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7,
0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, 0x3DC9, 0x3DC9, 0x3DC9,
0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC,
0x3DCC, 0x3DCC, 0x3DCC, 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5,
0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, 0x3DDF, 0x3DE0, 0x3DE1,
0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5,
};
static uint16_t test_pattern_ref[] = {
0x0, 0x3c24, 0x3c68, 0x3c8e, 0x3ca4, 0x3cb7, 0x3cc8, 0x3cd9, 0x3ce8, 0x3cf6, 0x3d01, 0x3d08,
0x3d0e, 0x3d14, 0x3d19, 0x3d1f, 0x3d24, 0x3d29, 0x3d2e, 0x3d33, 0x3d37, 0x3d3c, 0x3d40, 0x3d45,
0x3d48, 0x3d4d, 0x3d51, 0x3d55, 0x3d59, 0x3d5d, 0x3d61, 0x3d64, 0x3d68, 0x3d6b, 0x3d6f, 0x3d72,
0x3d76, 0x3d79, 0x3d7c, 0x3d80, 0x3d81, 0x3d83, 0x3d85, 0x3d86, 0x3d88, 0x3d89, 0x3d8b, 0x3d8c,
0x3d8e, 0x3d90, 0x3d91, 0x3d92, 0x3d94, 0x3d95, 0x3d97, 0x3d98, 0x3d99, 0x3d9b, 0x3d9c, 0x3d9d,
0x3d9f, 0x3da0, 0x3da1, 0x3da2, 0x3da4, 0x3da5, 0x3da6, 0x3da8, 0x3da9, 0x3daa, 0x3dab, 0x3dad,
0x3dae, 0x3daf, 0x3db0, 0x3db1, 0x3db3, 0x3db4, 0x3db5, 0x3db6, 0x3db7, 0x3db9, 0x3db9, 0x3dbb,
0x3dbc, 0x3dbd, 0x3dbe, 0x3dbf, 0x3dc0, 0x3dc1, 0x3dc2, 0x3dc3, 0x3dc5, 0x3dc5, 0x3dc7, 0x3dc8,
0x3dc8, 0x3dca, 0x3dcb, 0x3dcc, 0x3dcd, 0x3dce, 0x3dcf, 0x3dd0, 0x3dd1, 0x3dd2, 0x3dd3, 0x3dd4,
0x3dd5, 0x3dd6, 0x3dd7, 0x3dd8, 0x3dd9, 0x3dda, 0x3ddb, 0x3ddb, 0x3ddd, 0x3dde, 0x3dde, 0x3ddf,
0x3de1, 0x3de1, 0x3de2, 0x3de3, 0x3de4, 0x3de5, 0x3de6, 0x3de7, 0x3de8, 0x3de8, 0x3dea, 0x3deb,
0x3deb, 0x3dec, 0x3ded, 0x3dee, 0x3def, 0x3def, 0x3df1, 0x3df2, 0x3df2, 0x3df3, 0x3df4, 0x3df5,
0x3df6, 0x3df7, 0x3df7, 0x3df8, 0x3df9, 0x3dfa, 0x3dfb, 0x3dfb, 0x3dfc, 0x3dfd, 0x3dfe, 0x3dff,
0x3e00, 0x3e00, 0x3e00, 0x3e01, 0x3e01, 0x3e02, 0x3e02, 0x3e03, 0x3e03, 0x3e03, 0x3e04, 0x3e04,
0x3e05, 0x3e05, 0x3e05, 0x3e06, 0x3e06, 0x3e07, 0x3e07, 0x3e07, 0x3e08, 0x3e08, 0x3e09, 0x3e09,
0x3e09, 0x3e0a, 0x3e0a, 0x3e0b, 0x3e0b, 0x3e0b, 0x3e0b, 0x3e0c, 0x3e0c, 0x3e0d, 0x3e0d, 0x3e0d,
0x3e0e, 0x3e0e, 0x3e0f, 0x3e0f, 0x3e10, 0x3e10, 0x3e10, 0x3e10, 0x3e11, 0x3e11, 0x3e11, 0x3e12,
0x3e12, 0x3e13, 0x3e13, 0x3e14, 0x3e14, 0x3e14, 0x3e14, 0x3e15, 0x3e15, 0x3e15, 0x3e16, 0x3e16,
0x3e17, 0x3e17, 0x3e17, 0x3e17, 0x3e18, 0x3e18, 0x3e19, 0x3e19, 0x3e19, 0x3e19, 0x3e1a, 0x3e1a,
0x3e1b, 0x3e1b, 0x3e1b, 0x3e1c, 0x3e1c, 0x3e1c, 0x3e1d, 0x3e1d, 0x3e1d, 0x3e1e, 0x3e1e, 0x3e1e,
0x3e1f, 0x3e1f, 0x3e1f, 0x3e20, 0x3e20, 0x3e20, 0x3e21, 0x3e21, 0x3e21, 0x3e22, 0x3e22, 0x3e22,
0x3e22, 0x3e23, 0x3e23, 0x3e24, 0x3e24, 0x3e24, 0x3e24, 0x3e25, 0x3e25, 0x3e26, 0x3e26, 0x3e26,
0x3e26, 0x3e27, 0x3e27, 0x3e27, 0x3e28, 0x3e28, 0x3e28, 0x3e29, 0x3e29, 0x3e29, 0x3e29, 0x3e2a,
0x3e2a, 0x3e2a, 0x3e2b, 0x3e2b, 0x3e2b, 0x3e2c, 0x3e2c, 0x3e2c, 0x3e2d, 0x3e2d, 0x3e2d, 0x3e2d,
0x3e2e, 0x3e2e, 0x3e2f, 0x3e2f, 0x3e2f, 0x3e2f, 0x3e30, 0x3e30, 0x3e30, 0x3e30, 0x3e31, 0x3e31,
0x3e31, 0x3e32, 0x3e32, 0x3e32, 0x3e33, 0x3e33, 0x3e33, 0x3e33, 0x3e34, 0x3e34, 0x3e34, 0x3e35,
0x3e35, 0x3e35, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e37, 0x3e37, 0x3e37, 0x3e38, 0x3e38,
0x3e39, 0x3e39, 0x3e39, 0x3e39, 0x3e39, 0x3e3a, 0x3e3a, 0x3e3b, 0x3e3b, 0x3e3b, 0x3e3b, 0x3e3b,
0x3e3c, 0x3e3c, 0x3e3c, 0x3e3d, 0x3e3d, 0x3e3d, 0x3e3d, 0x3e3d, 0x3e3e, 0x3e3e, 0x3e3f, 0x3e3f,
0x3e3f, 0x3e3f, 0x3e3f, 0x3e40, 0x3e40, 0x3e41, 0x3e41, 0x3e41, 0x3e41, 0x3e41, 0x3e42, 0x3e42,
0x3e42, 0x3e43, 0x3e43, 0x3e43, 0x3e43, 0x3e44, 0x3e44, 0x3e44, 0x3e45, 0x3e45, 0x3e45, 0x3e45,
0x3e45, 0x3e46, 0x3e46, 0x3e47, 0x3e47, 0x3e47, 0x3e47, 0x3e47, 0x3e48, 0x3e48, 0x3e48, 0x3e48,
0x3e48, 0x3e49, 0x3e49, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4b, 0x3e4b, 0x3e4b, 0x3e4c,
0x3e4c, 0x3e4c, 0x3e4c, 0x3e4c, 0x3e4d, 0x3e4d, 0x3e4e, 0x3e4e, 0x3e4e, 0x3e4e, 0x3e4e, 0x3e4f,
0x3e4f, 0x3e4f, 0x3e4f, 0x3e4f, 0x3e50, 0x3e50, 0x3e51, 0x3e51, 0x3e51, 0x3e51, 0x3e51, 0x3e52,
0x3e52, 0x3e52, 0x3e52, 0x3e52, 0x3e53, 0x3e53, 0x3e53, 0x3e54, 0x3e54, 0x3e54, 0x3e54, 0x3e55,
0x3e55, 0x3e55, 0x3e55, 0x3e55, 0x3e56, 0x3e56, 0x3e56, 0x3e57, 0x3e57, 0x3e57, 0x3e57, 0x3e57,
0x3e58, 0x3e58, 0x3e58, 0x3e58, 0x3e59, 0x3e59, 0x3e59, 0x3e5a, 0x3e5a, 0x3e5a, 0x3e5a, 0x3e5a,
0x3e5b, 0x3e5b, 0x3e5b, 0x3e5b, 0x3e5b, 0x3e5c, 0x3e5c, 0x3e5d, 0x3e5d, 0x3e5d, 0x3e5d, 0x3e5d,
0x3e5e, 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5f, 0x3e5f, 0x3e5f, 0x3e5f, 0x3e5f, 0x3e60, 0x3e60,
0x3e61, 0x3e61, 0x3e61, 0x3e61, 0x3e61, 0x3e62, 0x3e62, 0x3e62, 0x3e62, 0x3e62, 0x3e63, 0x3e63,
0x3e63, 0x3e63, 0x3e63, 0x3e64, 0x3e64, 0x3e65, 0x3e65, 0x3e65, 0x3e65, 0x3e65, 0x3e66, 0x3e66,
0x3e66, 0x3e66, 0x3e66, 0x3e67, 0x3e67, 0x3e67, 0x3e67, 0x3e67, 0x3e68, 0x3e68, 0x3e68, 0x3e68,
0x3e68, 0x3e69, 0x3e69, 0x3e6a, 0x3e6a, 0x3e6a, 0x3e6a, 0x3e6a, 0x3e6b, 0x3e6b, 0x3e6b, 0x3e6b,
0x3e6b, 0x3e6c, 0x3e6c, 0x3e6c, 0x3e6c, 0x3e6c, 0x3e6d, 0x3e6d, 0x3e6d, 0x3e6d, 0x3e6d, 0x3e6e,
0x3e6e, 0x3e6e, 0x3e6e, 0x3e6e, 0x3e6f, 0x3e6f, 0x3e6f, 0x3e6f, 0x3e6f, 0x3e70, 0x3e70, 0x3e71,
0x3e71, 0x3e71, 0x3e71, 0x3e71, 0x3e72, 0x3e72, 0x3e72, 0x3e72, 0x3e72, 0x3e73, 0x3e73, 0x3e73,
0x3e73, 0x3e73, 0x3e74, 0x3e74, 0x3e74, 0x3e74, 0x3e74, 0x3e75, 0x3e75, 0x3e75, 0x3e75, 0x3e76,
0x3e76, 0x3e76, 0x3e76, 0x3e76, 0x3e77, 0x3e77, 0x3e77, 0x3e77, 0x3e77, 0x3e78, 0x3e78, 0x3e78,
0x3e78, 0x3e78, 0x3e79, 0x3e79, 0x3e79, 0x3e79, 0x3e79, 0x3e7a, 0x3e7a, 0x3e7a, 0x3e7a, 0x3e7a,
0x3e7b, 0x3e7b, 0x3e7b, 0x3e7b, 0x3e7b, 0x3e7c, 0x3e7c, 0x3e7c, 0x3e7c, 0x3e7c, 0x3e7d, 0x3e7d,
0x3e7d, 0x3e7d, 0x3e7d, 0x3e7e, 0x3e7e, 0x3e7e, 0x3e7e, 0x3e7f, 0x3e7f, 0x3e7f, 0x3e7f, 0x3e7f,
0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e81, 0x3e81, 0x3e81,
0x3e81, 0x3e81, 0x3e81, 0x3e81, 0x3e81, 0x3e81, 0x3e81, 0x3e82, 0x3e82, 0x3e82, 0x3e82, 0x3e82,
0x3e82, 0x3e82, 0x3e82, 0x3e82, 0x3e83, 0x3e83, 0x3e83, 0x3e83, 0x3e83, 0x3e83, 0x3e83, 0x3e83,
0x3e83, 0x3e83, 0x3e84, 0x3e84, 0x3e84, 0x3e84, 0x3e84, 0x3e84, 0x3e84, 0x3e84, 0x3e84, 0x3e84,
0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e86, 0x3e86,
0x3e86, 0x3e86, 0x3e86, 0x3e86, 0x3e86, 0x3e86, 0x3e86, 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e87,
0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e88, 0x3e88, 0x3e88, 0x3e88, 0x3e88, 0x3e88, 0x3e88,
0x3e88, 0x3e88, 0x3e88, 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89,
0x3e89, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8b, 0x3e8b,
0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b,
0x3e8b, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8d,
0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8e, 0x3e8e, 0x3e8e,
0x3e8e, 0x3e8e, 0x3e8e, 0x3e8e, 0x3e8e, 0x3e8e, 0x3e8f, 0x3e8f, 0x3e8f, 0x3e8f, 0x3e8f, 0x3e8f,
0x3e8f, 0x3e8f, 0x3e8f, 0x3e8f, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90,
0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e91, 0x3e91, 0x3e91, 0x3e91, 0x3e91,
0x3e91, 0x3e91, 0x3e91, 0x3e91, 0x3e91, 0x3e92, 0x3e92, 0x3e92, 0x3e92, 0x3e92, 0x3e92, 0x3e92,
0x3e92, 0x3e92, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93,
0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94,
0x3e94, 0x3e94, 0x3e94, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95,
0x3e95, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e97, 0x3e97,
0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97,
0x3e97, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e99,
0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99,
0x3e99, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9b,
0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9c, 0x3e9c, 0x3e9c,
0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c,
0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9e, 0x3e9e, 0x3e9e,
0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e,
0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3ea0, 0x3ea0,
0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0,
0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea2, 0x3ea2,
0x3ea2, 0x3ea2, 0x3ea2, 0x3ea2, 0x3ea2, 0x3ea3, 0x3ea3, 0x3ea4, 0x3ea4, 0x3ea4, 0x3ea5, 0x3ea5,
0x3ea6, 0x3ea6, 0x3ea6, 0x3ea7, 0x3ea7, 0x3ea7, 0x3ea8, 0x3ea8, 0x3ea9, 0x3ea9, 0x3ea9, 0x3eaa,
0x3eaa, 0x3eaa, 0x3eab, 0x3eab,
};
// Builds the host-side reference output for the sqrt test.
//
// ofmap       - receives one bf16 reference value per input element
// ifmap       - bf16 input values (only read in the non-fixed mode)
// ifmap_shape - tensor shape; tl_shape_size() of it is the element count
//
// In PRE_DATA_COMPARE_FIX mode the reference is taken verbatim from
// test_pattern_ref; in every other mode it is pow(x, 0.5) of the input,
// converted back to bf16.
static void tl_lut_ref(uint16_t *ofmap, uint16_t *ifmap, cvk_tl_shape_t ifmap_shape) {
  const bool use_fixed_pattern = (mode == PRE_DATA_COMPARE_FIX);
  for (uint32_t idx = 0; idx < tl_shape_size(&ifmap_shape); ++idx) {
    if (use_fixed_pattern) {
      ofmap[idx] = test_pattern_ref[idx];
      continue;
    }
    const auto in_value = convert_bf16_fp32(ifmap[idx]);
    ofmap[idx] = convert_fp32_bf16(pow(in_value, 0.5));
  }
}
// Compares the device output against the host reference, element by element.
//
// ofmap_data       - bf16 values read back from the device
// ref_data         - bf16 reference values
// ifmap            - bf16 inputs, used only for the diagnostic message
// ifmap_shape_size - number of elements to compare
// mode             - PRE_DATA_COMPARE_FIX requires bit-exact equality; any
//                    other mode accepts an absolute fp32 difference < 0.001
//
// Returns true when every element matches. On the first mismatch a diagnostic
// is printed to stderr and the process exits with status -1, so this never
// returns false.
static bool verify(uint16_t *ofmap_data, uint16_t *ref_data, uint16_t *ifmap,
                   uint64_t ifmap_shape_size, TEST_MODE mode) {
  for (uint64_t i = 0; i < ifmap_shape_size; i++) {
    const uint16_t ref = ref_data[i];
    const uint16_t ofmap_data_bf16 = ofmap_data[i];
    const float ref_f = convert_bf16_fp32(ref);
    const float ofmap_data_f = convert_bf16_fp32(ofmap_data_bf16);

    bool is_close;
    if (mode == PRE_DATA_COMPARE_FIX) {
      is_close = (ofmap_data_bf16 == ref);
    } else {
      is_close = fabs(ref_f - ofmap_data_f) < 0.001;
    }

    if (!is_close) {
      // uint64_t is 'unsigned long long' on 32-bit targets, where "%lu" would
      // misread the varargs; cast explicitly and print with "%llu" so the
      // message is correct on both 32- and 64-bit builds.
      fprintf(stderr,
              "comparing failed at ofmap_data[%llu](input:%e), got %x, exp %x, "
              "fp32: got %e exp %e\n",
              static_cast<unsigned long long>(i), convert_bf16_fp32(ifmap[i]), ofmap_data_bf16,
              ref, ofmap_data_f, ref_f);
      exit(-1);
    }
  }
  return true;
}
// Fills the input feature map with test data.
//
// ifmap            - destination buffer of bf16 values
// ifmap_shape_size - number of elements in ifmap
//
// In PRE_DATA_COMPARE_FIX mode the fixed test_pattern is copied in. In every
// other mode each element is a rand()-based value in [2^-10, 2^10], converted
// to bf16; the generator is seeded once from the current time.
static void gen_input(uint16_t *ifmap, uint64_t ifmap_shape_size) {
  if (mode == PRE_DATA_COMPARE_FIX) {
    // Never copy more than the destination can hold.
    size_t copy_bytes = sizeof(test_pattern);
    const uint64_t ifmap_bytes = ifmap_shape_size * sizeof(uint16_t);
    if (ifmap_bytes < copy_bytes) {
      copy_bytes = static_cast<size_t>(ifmap_bytes);
    }
    memcpy(ifmap, &test_pattern, copy_bytes);
  } else {
    // Seed once, then fill the buffer in a single pass. The former outer loop
    // reseeded and refilled the whole buffer ifmap_shape_size times.
    srand(static_cast<unsigned>(time(0)));
    float LO = pow(2, -10);
    float HI = pow(2, 10);
    for (uint64_t i = 0; i < ifmap_shape_size; i++) {
      float r3 = LO + static_cast<float>(rand()) / (static_cast<float>(RAND_MAX / (HI - LO)));
      ifmap[i] = convert_fp32_bf16(r3);
    }
  }
#ifdef DBG
  for (uint64_t i = 0; i < ifmap_shape_size; i++) {
    // Cast the index: uint64_t is not 'unsigned long' on 32-bit targets.
    printf("source if[%llu] bf16 %f 0x%x, log2f is %f\n", static_cast<unsigned long long>(i),
           convert_bf16_fp32(ifmap[i]), ifmap[i], floor(log2((convert_bf16_fp32(ifmap[i])))));
  }
#endif /* ifdef DBG */
}
// Runs one sqrt test case for an (input_n, input_c, input_h, input_w) bf16 tensor.
//
// ctx / bmk         - runtime handle and kernel context, passed through to the
//                     test_* and cvm_* helpers
// input_n/c/h/w     - the four dimensions of the input shape; the output uses
//                     the same shape
//
// Steps: allocate host buffers and local tensors, generate input and reference
// data, build the sqrt lookup tables, copy input and tables into the local
// tensors, emit and submit the sqrt command, read the result back and verify
// it, then release everything. verify() terminates the process on a mismatch,
// so returning from this function means the case passed.
static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk, uint32_t input_n, uint32_t input_c,
uint32_t input_h, uint32_t input_w) {
cvk_fmt_t fmt = CVK_FMT_BF16;
// TODO: check more shape / align
cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w};
cvk_tl_shape_t ofmap_shape = ifmap_shape;
// the table shape is reported by the library, not chosen by the test
cvk_tl_shape_t table_shape;
cvm_table_shape(bmk, &table_shape);
uint64_t ifmap_shape_size = tl_shape_size(&ifmap_shape);
uint64_t ofmap_size = tl_shape_size(&ofmap_shape);
uint64_t table_size = tl_shape_size(&table_shape);
// prepare input data with size
int data_type_size = bytesize_of_fmt(fmt);
uint64_t ifmap_bytesize = ifmap_shape_size * data_type_size;
uint64_t ofmap_bytesize = ofmap_size * data_type_size;
uint64_t table_bytesize = table_size * data_type_size;
// host-side buffers: input, reference output and the two lookup tables
uint16_t *ifmap = (uint16_t *)xmalloc(ifmap_bytesize);
uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize);
uint16_t *table_data = (uint16_t *)xmalloc(table_bytesize);
uint16_t *table_data_mantissa = (uint16_t *)xmalloc(table_bytesize);
// alloc lmem
cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1);
cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1);
// scratch tensor with the same shape as the input, handed to cvm_emit_sqrt
cvk_tl_t *tl_buf = test_alloc_tl(bmk, tl_ifmap->shape, fmt, /*align*/ 1);
cvk_tl_t *cvk_tl_table_answer = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
cvk_tl_t *cvk_tl_table_answer_mantissa = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1);
// generate testbench
gen_input(ifmap, ifmap_shape_size);
tl_lut_ref(ref_data, ifmap, ifmap_shape);
// prepare table
cvm_sqrt_tbl(table_data, table_data_mantissa, &table_shape);
// sys->lmem
test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)ifmap);
test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer, (uint8_t *)table_data);
test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer_mantissa, (uint8_t *)table_data_mantissa);
cvm_emit_sqrt(bmk, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_mantissa,
tl_ofmap_bf16);
// issue cmd
test_submit_comp(ctx, bmk);
// get output from lmem->sys
// the returned buffer is released with free() below
uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, tl_ofmap_bf16);
verify(ofmap_data, ref_data, ifmap, ifmap_shape_size, mode);
// local tensors are released in the reverse order of their allocation
free_tl(bmk, cvk_tl_table_answer_mantissa);
free_tl(bmk, cvk_tl_table_answer);
free_tl(bmk, tl_buf);
free_tl(bmk, tl_ofmap_bf16);
free_tl(bmk, tl_ifmap);
free(ifmap);
free(ref_data);
free(ofmap_data);
free(table_data);
free(table_data_mantissa);
}
// Test driver: runs testbench() once for every TEST_MODE value from
// PRE_DATA_COMPARE_FIX up to (but excluding) TEST_MODE_MAX.
//
// The fixed-pattern mode uses a 1x32x4x8 input; every other mode uses
// 1x32x16x16. The rounding mode saved by set_store_feround() is restored
// before returning.
int main() {
  CVI_RT_HANDLE ctx;
  cvk_context_t *bmk;

  int saved_round_mode = set_store_feround();
  test_init(&ctx, &bmk);

  for (int mode_idx = PRE_DATA_COMPARE_FIX; mode_idx < TEST_MODE_MAX; ++mode_idx) {
    mode = static_cast<TEST_MODE>(mode_idx);
    printf("test mode %d...\n", mode);

    const bool fixed_pattern = (mode == PRE_DATA_COMPARE_FIX);
    const int input_n = 1;
    const int input_c = 32;
    const int input_h = fixed_pattern ? 4 : 16;
    const int input_w = fixed_pattern ? 8 : 16;

    testbench(&ctx, bmk, input_n, input_c, input_h, input_w);
  }

  test_exit(&ctx, bmk);
  restore_feround(saved_round_mode);
  return 0;
}

View File

@ -0,0 +1,383 @@
#ifndef _BM_NATIVE_REF_H_
#define _BM_NATIVE_REF_H_
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/* Overlays a 32-bit unsigned integer (ival) and a float (fval) on the same storage. */
typedef union {
uint32_t ival;
float fval;
} IF_VAL;
/*
* fp32 version
*/
/*
* array_cmp_float / array_cmp_int
* NOTE(review): these two are undocumented in this header. Presumably they
* follow the contract documented for array_cmp_int8 further down (compare
* p_exp with p_got over count elements, using info as the label printed on
* error, returning 0 or -1), with delta being the allowed difference for the
* float variant -- confirm against the implementation.
*/
int array_cmp_float(const char *const info, float *p_exp, float *p_got, int count, float delta);
int array_cmp_int(const char *const info, int *p_exp, int *p_got, int count);
/**
* @name calc_dilute_hw
* @brief calculate diluted dimension
* @ingroup libbmutils
*
* @param [in] h origin dimension
* @param [in] ins_h scaling factor, 0 -> no scaling
* @param [in] ins_h_l compensation value after last value in each row
* @param [in] pad_h_b extra padding left or bottom
* @param [in] pad_h_t extra padding right or top
*
* @retval diluted value
*/
int calc_dilute_hw(int h, int ins_h, int ins_h_l, int pad_h_b, int pad_h_t);
/**
* @name calc_output_hw
* @brief calculate output dimension by kernel and stride size
* @ingroup libbmutils
*
* @param [in] hw origin dimension
* @param [in] khw kernel size along the same dimension
* @param [in] stride stride size along the same dimension
*
* @retval output dimension
*/
int calc_output_hw(int hw, int khw, int stride);
/**
* @name fill_pad_fmap_fp32
* @brief fill padded feature map with unpadded map
* @ingroup libbmutils
*
* @param [in] before input array
* @param [out] after output array reference, if NULL, alloc a new one
* @param [in] pad_value padding value
* @param [in] pad_t padding top size
* @param [in] pad_b padding bottom size
* @param [in] pad_l padding left size
* @param [in] pad_r padding right size
* @param [in] ins_h scaling factor h
* @param [in] ins_w scaling factor w
* @param [in] ins_h_last compensation value after last value in each row
* @param [in] ins_w_last compensation value after last value in each col
* @param [in] h_before origin height
* @param [in] w_before origin width
*
* @note The padding arguments are ordered top, bottom, left, right here, which
* differs from fill_pad_fmap_int8 (left, right, top, bottom).
*
* @retval BM_SUCCESS success
* @retval BM_ERR_INVALID_ARGUMENT before or after is null pointer
* @retval BM_ERR_NOMEM can't alloc new output array
*/
int fill_pad_fmap_fp32(const float *before, float **after, float pad_value, int pad_t, int pad_b,
int pad_l, int pad_r, int ins_h, int ins_w, int ins_h_last, int ins_w_last,
int h_before, int w_before);
/*
* fp32 reference routines: native_md_scalar, native_conv_ref,
* native_pooling_forward_max and native_pooling_forward_ave.
* NOTE(review): none of these is documented in this header. Judging by the
* names only, they are presumably host-side reference implementations of an
* element-wise scalar op, a convolution, and max / average pooling -- confirm
* the meaning of every parameter against the implementation before relying on
* it.
*/
void native_md_scalar(float *a, float *b, float *r, int N, int C, int H, int W, int op,
bool result_add);
void native_conv_ref(const void *ifmap, void *ofmap, const void *weight, int input_n, int input_c,
int input_h, int input_w, int output_c, int output_h, int output_w, int groups,
int kh, int kw, int dilation_h, int dilation_w, int pad_h, int pad_w,
int stride_h, int stride_w, int flip, int using_bias, const void *bias,
int result_add);
void native_pooling_forward_max(const float *bottom_data, float *top_data, int *mask_data,
const int count, const int num, const int channels,
const int height, const int width, const int pooled_height,
const int pooled_width, const int kernel_h, const int kernel_w,
const int stride_h, const int stride_w, const int pad_h,
const int pad_w);
void native_pooling_forward_ave(const float *bottom_data, float *top_data, const int count,
const int num, const int channels, const int height,
const int width, const int pooled_height, const int pooled_width,
const int kernel_h, const int kernel_w, const int stride_h,
const int stride_w, const int pad_h, const int pad_w);
/*
* int8 version
*/
/**
* @name array_cmp_int8
* @brief compare the content of p_exp and p_got and print the error index
* and value
* @ingroup libbmutils
*
* @param [in] info information string printed when an error is encountered
* @param [in] p_exp array of expected values
* @param [in] p_got array of values to compare against p_exp
* @param [in] count number of elements to compare
* @retval 0 no error
* @retval -1 error occur
*/
int array_cmp_int8(const char *const info, const int8_t *p_exp, const int8_t *p_got, int count);
/**
* @name fill_pad_fmap_int8
* @brief fill padded feature map with unpadded map
* @ingroup libbmutils
*
* @param [in] before input array
* @param [out] pafter output array reference, if NULL, alloc a new one
* @param [in] pad_val padding value
* @param [in] pad_l padding left size
* @param [in] pad_r padding right size
* @param [in] pad_t padding top size
* @param [in] pad_b padding bottom size
* @param [in] ins_h scaling factor h
* @param [in] ins_w scaling factor w
* @param [in] ins_h_last compensation value after last value in each row
* @param [in] ins_w_last compensation value after last value in each col
* @param [in] h_before origin height
* @param [in] w_before origin width
*
* @retval BM_SUCCESS success
* @retval BM_ERR_INVALID_ARGUMENT before or pafter is null pointer
* @retval BM_ERR_NOMEM can't alloc new output array
*/
int fill_pad_fmap_int8(const int8_t *before, int8_t **pafter, int pad_val, int pad_l, int pad_r,
int pad_t, int pad_b, int ins_h, int ins_w, int ins_h_last, int ins_w_last,
int h_before, int w_before);
/*
* fill_pad_fmap_bf16 takes the same parameter list as fill_pad_fmap_int8 with
* unsigned short elements.
* NOTE(review): presumably the same semantics applied to bf16 data -- confirm
* against the implementation.
*/
int fill_pad_fmap_bf16(const unsigned short *before, unsigned short **pafter, int pad_val,
int pad_l, int pad_r, int pad_t, int pad_b, int ins_h, int ins_w,
int ins_h_last, int ins_w_last, int h_before, int w_before);
/**
* @name fill_int_with_int8
* @brief (int) pdest[i] = (int8_t)psrc[i] for each element
* @ingroup libbmutils
*
* @param [out] pdest output array
* @param [in] psrc input array
* @param [in] len length of input array
*/
void fill_int_with_int8(int *pdest, int8_t *psrc, int len);
/**
* @name fill_int_with_uint8
* @brief (int) pdest[i] = (uint8_t)psrc[i] for each element
* @ingroup libbmutils
*
* @param [out] pdest output array
* @param [in] psrc input array
* @param [in] len length of input array
*/
void fill_int_with_uint8(int *pdest, uint8_t *psrc, int len);
/**
* @name fill_int_with_int16
* @brief (int) pdest[i] = (int16_t)psrc[i] for each element
* @ingroup libbmutils
*
* @param [out] pdest output array
* @param [in] psrc input array
* @param [in] len length of input array
*/
void fill_int_with_int16(int *pdest, int16_t *psrc, int len);
/*
* int8 counterpart of native_md_scalar: same parameter list with int8_t arrays.
* NOTE(review): undocumented here; the meaning of op and result_add must be
* confirmed against the implementation.
*/
void native_md_scalar_int8(int8_t *a, int8_t *b, int8_t *r, int N, int C, int H, int W, int op,
bool result_add);
/**
* @name inner_product
* @brief inner product of two arrays
* @ingroup libbmutils
*
* @param [in] a input array 0
* @param [in] b input array 1
* @param [in] len length of a or b
* @param [out] c store the summation
*/
void inner_product(const int *a, const int *b, int len, int *c);
/* Float variant of inner_product: same parameter list with float elements. */
void inner_float_product(const float *a, const float *b, int len, float *c);
/**
* @name native_conv_int8
* @brief do convolution for a specific 8bit feature map
* @ingroup libbmutils
*
* @param [in] ifmap input array
* @param [in] weight weight data array
* @param [in] bias bias array if !NULL, add bias
* @param [out] ofmap output array
* @param [in] in input batch size
* @param [in] ic input channel size
* @param [in] ih input height
* @param [in] iw input width
* @param [in] oc output channel size
* @param [in] kh kernel height
* @param [in] kw kernel width
* @param [in] dh kernel dilute height factor
* @param [in] dw kernel dilute width factor
* @param [in] pad_h_t padding top size
* @param [in] pad_h_b padding bottom size
* @param [in] pad_w_l padding left size
* @param [in] pad_w_r padding right size
* @param [in] stride_h stride height
* @param [in] stride_w stride width
* @param [in] ins_h insert extra element for each i_fmap row
* @param [in] ins_w insert extra element for each i_fmap col
* @param [in] ins_h_last insert extra element for last i_fmap row
* @param [in] ins_w_last insert extra element for last i_fmap col
* @param [in] input_sign i_fmap data type. 0 => signed, 1 => unsigned
* @param [in] r_shift_width scale bit for saturation
* @param [in] do_relu NOTE(review): previously undocumented; presumably
* non-zero applies ReLU to the result -- confirm
*
* @retval BM_SUCCESS success
* @retval other saturation failed
*/
int native_conv_int8(const int8_t *ifmap, const int8_t *weight, const int16_t *bias, int8_t *ofmap,
int in, int ic, int ih, int iw, int oc, int kh, int kw, int dh, int dw,
int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, int stride_h, int stride_w,
int ins_h, int ins_w, int ins_h_last, int ins_w_last, int input_sign,
int r_shift_width, int do_relu);
/**
* @name native_fc_int8
* @brief do full-connected layer for specific feature map
* @ingroup libbmutils
*
* NOTE(review): the entries from L_sign to do_relu were all documented as
* "padding top size", an evident copy-paste error. The descriptions below are
* inferred from the parameter names only -- confirm against the implementation.
*
* @param [in] L input array
* @param [in] R weight array
* @param [in] B bias array if !NULL, add bias
* @param [in] Y accumulation array if !NULL, add this
* @param [out] Y_ref output array
* @param [in] L_row_num input row size
* @param [in] L_col_num input col size
* @param [in] R_col_num weight col size
* @param [in] L_sign presumably the signedness flag of L
* @param [in] R_sign presumably the signedness flag of R
* @param [in] B_sign presumably the signedness flag of B
* @param [in] l_shift_width presumably a left-shift bit count
* @param [in] r_shift_width presumably a right-shift bit count
* @param [in] is_result_int8 presumably selects an 8bit result
* @param [in] do_relu presumably non-zero applies ReLU to the result
*
* @retval BM_SUCCESS success
* @retval other saturation failed
*/
int native_fc_int8(const int8_t *L, const int8_t *R, const int16_t *B, const int16_t *Y, int *Y_ref,
int L_row_num, int L_col_num, int R_col_num, int L_sign, int R_sign, int B_sign,
int l_shift_width, int r_shift_width, int is_result_int8, int do_relu);
/**
* @name native_pooling_ave_int8
* @brief do average pooling for specific feature map
* @ingroup libbmutils
*
* @param [in] i_fmap input array
* @param [in] weight weight data array
* @param [in] bias bias array if !NULL, add bias
* @param [out] o_fmap output array
* @param [in] pad_h_t padding top size
* @param [in] pad_h_b padding bottom size
* @param [in] pad_w_l padding left size
* @param [in] pad_w_r padding right size
* @param [in] stride_h stride height
* @param [in] stride_w stride width
* @param [in] ins_h insert extra element for each i_fmap row
* @param [in] ins_w insert extra element for each i_fmap col
* @param [in] ins_h_last insert extra element for last i_fmap row
* @param [in] ins_w_last insert extra element for last i_fmap col
* @param [in] input_sign i_fmap data type. 0 => signed, 1 => unsigned
* @param [in] satu_sign saturation data type. 0 => unsigned, 1 => signed
* @param [in] r_shift_width scale bit for saturation
* @param [in] const_weight if weight array has one uint8_t value
*
* @note The prototype takes ins_w before ins_h and ins_w_last before
* ins_h_last, the opposite of the order listed above and of
* native_pooling_max_int8.
* @note input_n, input_c, input_h, input_w, kh and kw are not documented here.
*
* @retval BM_SUCCESS success
* @retval BM_ERR_INVALID_ARGUMENT illegal kh/kw or r_shift_width
*/
int native_pooling_ave_int8(const int8_t *i_fmap, const void *weight, const int16_t *bias,
int8_t *o_fmap, int input_n, int input_c, int input_h, int input_w,
int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r,
int stride_h, int stride_w, int ins_w, int ins_h, int ins_w_last,
int ins_h_last, int input_sign, int satu_sign, int r_shift_width,
int const_weight);
/**
* @name native_pooling_max_int8
* @brief do max pooling for specific feature map
* @ingroup libbmutils
*
* @param [in] i_fmap input array
* @param [out] o_fmap output array
* @param [in] pad_h_t padding top size
* @param [in] pad_h_b padding bottom size
* @param [in] pad_w_l padding left size
* @param [in] pad_w_r padding right size
* @param [in] stride_h stride height
* @param [in] stride_w stride width
* @param [in] ins_h insert extra element for each i_fmap row
* @param [in] ins_w insert extra element for each i_fmap col
* @param [in] ins_h_last insert extra element for last i_fmap row
* @param [in] ins_w_last insert extra element for last i_fmap col
* @param [in] input_sign i_fmap data type. 0 => unsigned, 1 => signed
*
* @note input_n, input_c, input_h, input_w, kh and kw are not documented here.
* @note NOTE(review): input_sign is described with the opposite polarity in
* native_conv_int8 and native_pooling_ave_int8 (0 => signed) -- confirm which
* one is correct.
*
* @retval BM_SUCCESS success
* @retval BM_ERR_INVALID_ARGUMENT illegal ins_h/w or ins_[hw]_last
*/
int native_pooling_max_int8(const int8_t *i_fmap, int8_t *o_fmap, int input_n, int input_c,
int input_h, int input_w, int kh, int kw, int pad_h_t, int pad_h_b,
int pad_w_l, int pad_w_r, int stride_h, int stride_w, int ins_h,
int ins_w, int ins_h_last, int ins_w_last, int input_sign);
/*
* fp32 pooling and depthwise routines.
* native_pooling_max_fp32 takes the same parameter list as
* native_pooling_max_int8 without input_sign and with float arrays;
* native_pooling_avg_fp32 adds a trailing avg_pooling_const.
* NOTE(review): none of the three is documented in this header; presumably the
* shared parameters mean the same as in the int8 variants -- confirm against
* the implementation.
*/
int native_pooling_max_fp32(const float *i_fmap, float *o_fmap, int input_n, int input_c,
int input_h, int input_w, int kh, int kw, int pad_h_t, int pad_h_b,
int pad_w_l, int pad_w_r, int stride_h, int stride_w, int ins_h,
int ins_w, int ins_h_last, int ins_w_last);
int native_pooling_avg_fp32(const float *i_fmap, float *o_fmap, int input_n, int input_c,
int input_h, int input_w, int kh, int kw, int pad_h_t, int pad_h_b,
int pad_w_l, int pad_w_r, int stride_h, int stride_w, int ins_h,
int ins_w, int ins_h_last, int ins_w_last, float avg_pooling_const);
int native_depthwise_fp32(const float *ifmap, const float *weight, const float *bias, float *ofmap,
int in, int ic, int ih, int iw, int kh, int kw, int dh, int dw,
int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, int stride_h,
int stride_w, int ins_h, int ins_w, int ins_h_last, int ins_w_last);
/**
* @name satu_2_8bit
* @brief saturate each signed or unsigned 8bit element in array
* @ingroup libbmutils
*
* @param [in] pBuff input array
* @param [in] len length of input array
* @param [out] pByteOut output array
* @param [in] rshiftbits right shift bit if round_floor && value != 0
* @param [in] round_floor enable floor rounding
* @param [in] sign_unsign 0 => unsigned, 1 => signed
*
* @retval BM_SUCCESS success
* @retval BM_ERR_INVALID_ARGUMENT rshiftbits < 0
*/
int satu_2_8bit(const int *pBuff, int len, int8_t *pByteOut, int rshiftbits, int round_floor,
int sign_unsign);
/**
* @name satu_2_16bit
* @brief saturate each signed or unsigned 16bit element in array
* @ingroup libbmutils
*
* @param [in] pBuff input array
* @param [in] len length of input array
* @param [out] pByteOut output array
* @param [in] rshiftbits right shift bit if round_floor && value != 0
* @param [in] round_floor enable floor rounding
* @param [in] sign_unsign 0 => unsigned, 1 => signed
*
* @retval BM_SUCCESS success
* @retval BM_ERR_INVALID_ARGUMENT rshiftbits < 0
*/
int satu_2_16bit(const int *pBuff, int len, short *pByteOut, int rshiftbits, int round_floor,
int sign_unsign);
#ifdef __cplusplus
}
#endif
#endif /* _BM_NATIVE_REF_H_ */

View File

@ -0,0 +1,41 @@
#ifndef TEST_TF_QUANT_UTIL_H
#define TEST_TF_QUANT_UTIL_H
#include <stdint.h>
// MAX(a, b): yields the larger of a and b (b when they compare equal).
// Implemented as a GNU statement expression with __typeof__ temporaries (a
// compiler extension), so each argument is evaluated exactly once.
#define MAX(a, b) \
({ \
__typeof__(a) _a = (a); \
__typeof__(b) _b = (b); \
_a > _b ? _a : _b; \
})
// MIN(a, b): yields the smaller of a and b (a when they compare equal).
// Same single-evaluation construction as MAX.
#define MIN(a, b) \
({ \
__typeof__(a) _a = (a); \
__typeof__(b) _b = (b); \
_a > _b ? _b : _a; \
})
#ifdef __cplusplus
extern "C" {
#endif
// NOTE(review): RoundingDivideByPOT, SaturatingRoundingDoublingHighMul,
// MultiplyByQuantizedMultiplier and QuantizeMultiplierSmallerThanOne carry the
// names of the fixed-point helpers used by gemmlowp / TensorFlow Lite;
// presumably they keep those semantics -- confirm against the implementation.
int32_t RoundingDivideByPOT(int32_t x, int exponent);
int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b);
int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int rshift);
void QuantizeMultiplierSmallerThanOne(float real_multiplier, uint32_t *quantized_multiplier,
int *right_shift);
// NOTE(review): judging by the parameter list, packs the per-channel bias,
// multiplier and rshift values into packed_data; the layout is not visible here.
void pack_chl_quan_param(uint32_t channels, int has_bias, int32_t *bias, uint32_t *multiplier,
int8_t *rshift, uint8_t *packed_data);
// Allowed rshift range per chip generation:
// 1880v2: 5bit right shift, [0, 31]
// 1822: 1bit sign, 5b shift, [-32, 31]
int8_t truncate_rshift(int8_t rshift, int8_t allow_lshift);
#ifdef __cplusplus
}
#endif
#endif // TEST_TF_QUANT_UTIL_H

View File

@ -0,0 +1,52 @@
# CMake toolchain file: cross-compile for 64-bit ARM Linux (aarch64) with the
# aarch64-linux-gnu- GCC toolchain.
#
# Usage:
#   cmake -DCMAKE_TOOLCHAIN_FILE=<path to this file> \
#         -DTOOLCHAIN_ROOT_DIR=<toolchain install prefix> ../
#
# Inputs:
#   TOOLCHAIN_ROOT_DIR     CMake variable, optional. Toolchain install prefix;
#                          the tools are taken from <prefix>/bin/. When it is
#                          not given the tools are looked up on PATH.
#   ENV{TOOLCHAIN_TOPDIR}  Environment variable, optional. Root for find_*().

# The CMakeForceCompiler module is deprecated and none of its macros is used in
# this file; the include is kept only so that projects relying on it being
# loaded here keep working.
include(CMakeForceCompiler)

# The target runs Linux (this is not a bare-metal "Generic" target).
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR aarch64)

# Toolchain location. Only prefix the tool names with a directory when a
# toolchain root was actually supplied; an empty root used to produce "/bin/",
# which defeated the PATH lookup this file promises for that case.
set(TOOLCHAIN_TOPDIR "${TOOLCHAIN_ROOT_DIR}")
if(NOT "${TOOLCHAIN_ROOT_DIR}" STREQUAL "")
  set(TC_PATH "${TOOLCHAIN_ROOT_DIR}/bin/")
else()
  set(TC_PATH "")
endif()

# The toolchain prefix for all toolchain executables
set(CROSS_COMPILE aarch64-linux-gnu-)
set(ARCH arm64)

# Cross compilers. They are set, not forced: CMake still runs its usual
# compiler checks against them.
set(CMAKE_C_COMPILER "${TC_PATH}${CROSS_COMPILE}gcc")
set(CMAKE_CXX_COMPILER "${TC_PATH}${CROSS_COMPILE}g++")

# Where the target environment containing the required libraries lives. On
# Debian-like systems this is /usr/aarch64-linux-gnu.
# NOTE(review): this reads the *environment* variable TOOLCHAIN_TOPDIR, not the
# CMake variable of the same name set above. Left unchanged because switching
# sources would alter how find_library()/find_path() re-root their search
# paths; confirm which one is intended.
set(CMAKE_FIND_ROOT_PATH $ENV{TOOLCHAIN_TOPDIR})
# never search the find root for programs; they come from the build host
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
# for libraries and headers in the target directories
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)

# We must set the OBJCOPY setting into cache so that it's available to the
# whole project. Otherwise, this does not get set into the CACHE and therefore
# the build doesn't know what the OBJCOPY filepath is
set(CMAKE_OBJCOPY "${TC_PATH}${CROSS_COMPILE}objcopy"
    CACHE FILEPATH "The toolchain objcopy command " FORCE)

# C flags (also reused for the assembler below).
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Os -std=gnu11")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdata-sections")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-pointer-to-int-cast")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsigned-char")

# C++ flags: plain 'char' is signed, matching the C flags.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsigned-char")

# Publish the C flags to the cache and reuse them for assembly sources.
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "")
set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "")

View File

@ -0,0 +1,57 @@
# CMake toolchain file: cross-compile for 32-bit ARM Linux (hard-float) with
# the arm-linux-gnueabihf- GCC toolchain.
#
# Usage:
#   cmake -DCMAKE_TOOLCHAIN_FILE=<path to this file> \
#         -DTOOLCHAIN_ROOT_DIR=<toolchain install prefix> ../
#
# Inputs:
#   TOOLCHAIN_ROOT_DIR     CMake variable, optional. Toolchain install prefix;
#                          the tools are taken from <prefix>/bin/. When it is
#                          not given the tools are looked up on PATH.
#   ENV{TOOLCHAIN_TOPDIR}  Environment variable, optional. Root for find_*().

# The CMakeForceCompiler module is deprecated and none of its macros is used in
# this file; the include is kept only so that projects relying on it being
# loaded here keep working.
include(CMakeForceCompiler)

# The target runs Linux (this is not a bare-metal "Generic" target).
set(CMAKE_SYSTEM_NAME Linux)
set(CMAKE_SYSTEM_PROCESSOR arm)

# Toolchain location. Only prefix the tool names with a directory when a
# toolchain root was actually supplied; an empty root used to produce "/bin/",
# which defeated the PATH lookup this file promises for that case.
set(TOOLCHAIN_TOPDIR "${TOOLCHAIN_ROOT_DIR}")
if(NOT "${TOOLCHAIN_ROOT_DIR}" STREQUAL "")
  set(TC_PATH "${TOOLCHAIN_ROOT_DIR}/bin/")
else()
  set(TC_PATH "")
endif()

# The toolchain prefix for all toolchain executables
set(CROSS_COMPILE arm-linux-gnueabihf-)
set(ARCH arm)

# Cross compilers. They are set, not forced: CMake still runs its usual
# compiler checks against them.
set(CMAKE_C_COMPILER "${TC_PATH}${CROSS_COMPILE}gcc")
set(CMAKE_CXX_COMPILER "${TC_PATH}${CROSS_COMPILE}g++")

# Where the target environment containing the required libraries lives. On
# Debian-like systems this is /usr/arm-linux-gnueabihf.
# NOTE(review): this reads the *environment* variable TOOLCHAIN_TOPDIR, not the
# CMake variable of the same name set above. Left unchanged because switching
# sources would alter how find_library()/find_path() re-root their search
# paths; confirm which one is intended.
set(CMAKE_FIND_ROOT_PATH $ENV{TOOLCHAIN_TOPDIR})
# never search the find root for programs; they come from the build host
set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)
# for libraries and headers in the target directories
set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)
set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)

# We must set the OBJCOPY setting into cache so that it's available to the
# whole project. Otherwise, this does not get set into the CACHE and therefore
# the build doesn't know what the OBJCOPY filepath is
set(CMAKE_OBJCOPY "${TC_PATH}${CROSS_COMPILE}objcopy"
    CACHE FILEPATH "The toolchain objcopy command " FORCE)

# C flags (also reused for the assembler below).
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Os -std=gnu11")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdata-sections")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-pointer-to-int-cast")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsigned-char")
# hard-float ABI with the NEON/VFPv4 floating-point unit
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfloat-abi=hard")
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon-vfpv4")

# C++ flags: same signedness, architecture and float ABI as the C flags.
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsigned-char")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=hard")
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon-vfpv4")

# Publish the C flags to the cache and reuse them for assembly sources.
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "")
set(CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "")