From 83dc4914fef1fdae9f7d49e95fc0a4d8765eadcc Mon Sep 17 00:00:00 2001 From: carbon Date: Fri, 31 May 2024 11:54:07 +0800 Subject: [PATCH] add cvimath commit ce8705f49da5e5f59c2ddb3253ef88323a0cd9c4 Author: sophgo-forum-service Date: Mon May 13 14:04:10 2024 +0800 [feat] cvimath opensource for cv18xx soc. - 9e8967 --- .version/2024-05-31.md | 1 + cvimath/.clang-format | 108 ++ cvimath/.gitignore | 5 + cvimath/CMakeLists.txt | 85 + cvimath/README.md | 21 + cvimath/clang-format.sh | 8 + cvimath/include/cvimath.h | 84 + cvimath/include/cvimath_internal.h | 1066 +++++++++++++ cvimath/include/test_cvikernel_util.h | 393 +++++ cvimath/sample/CMakeLists.txt | 28 + cvimath/sample/README.md | 21 + cvimath/sample/sample_bf16_fp32.cpp | 130 ++ cvimath/sample/sample_fp32_bf16.cpp | 109 ++ cvimath/sample/sample_gemm.cpp | 312 ++++ cvimath/sample/sample_mask.cpp | 175 +++ cvimath/sample/sample_reduce_mul.cpp | 160 ++ cvimath/sample/sample_set_val_by_mask.cpp | 656 ++++++++ .../sample/sample_sigmoid_linear_interp.cpp | 165 ++ cvimath/sample/sample_upsample.cpp | 145 ++ cvimath/src/1880v2_fp_convert.c | 293 ++++ cvimath/src/CMakeLists.txt | 12 + cvimath/src/bf16_gemm.c | 1361 +++++++++++++++++ cvimath/src/blas_cpu.cpp | 82 + cvimath/src/chl_quan.cpp | 118 ++ cvimath/src/common.c | 1032 +++++++++++++ cvimath/src/fp32_bf16_kernel.c | 138 ++ cvimath/src/gen_lut.h | 207 +++ cvimath/src/set_val_by_mask.c | 1169 ++++++++++++++ cvimath/src/tiu_lut_atan.c | 1106 ++++++++++++++ cvimath/src/tiu_lut_atan2.c | 787 ++++++++++ cvimath/src/tiu_reciprocal.c | 149 ++ cvimath/src/tiu_reshape_c.c | 387 +++++ cvimath/src/tiu_sigmoid.c | 266 ++++ cvimath/src/tiu_sqrt.c | 121 ++ cvimath/src/tiu_upsample.c | 54 + cvimath/src/util.c | 270 ++++ cvimath/tests/CMakeLists.txt | 34 + cvimath/tests/common/test_native_ref.c | 980 ++++++++++++ cvimath/tests/cvi1835/atan.cpp | 477 ++++++ cvimath/tests/cvi1835/atan2_degree.cpp | 667 ++++++++ cvimath/tests/cvi1835/atan2_radian.cpp | 719 +++++++++ 
cvimath/tests/cvi1835/bf16_fp32.cpp | 148 ++ cvimath/tests/cvi1835/blas_cpu.cpp | 60 + cvimath/tests/cvi1835/blas_tpu.cpp | 134 ++ .../tests/cvi1835/depthwise_reshape_same.cpp | 907 +++++++++++ cvimath/tests/cvi1835/fp32_bf16.cpp | 127 ++ cvimath/tests/cvi1835/gemm.cpp | 845 ++++++++++ cvimath/tests/cvi1835/mask.cpp | 158 ++ cvimath/tests/cvi1835/reciprocal.cpp | 376 +++++ .../tests/cvi1835/sigmoid_linear_interp.cpp | 907 +++++++++++ cvimath/tests/cvi1835/sqrt.cpp | 375 +++++ cvimath/tests/include/test_native_ref.h | 383 +++++ cvimath/tests/include/test_tf_quant_util.h | 41 + .../toolchain/toolchain-aarch64-linux.cmake | 52 + .../toolchain/toolchain-gnueabihf-linux.cmake | 57 + 55 files changed, 18671 insertions(+) create mode 100644 cvimath/.clang-format create mode 100644 cvimath/.gitignore create mode 100644 cvimath/CMakeLists.txt create mode 100644 cvimath/README.md create mode 100755 cvimath/clang-format.sh create mode 100644 cvimath/include/cvimath.h create mode 100644 cvimath/include/cvimath_internal.h create mode 100644 cvimath/include/test_cvikernel_util.h create mode 100644 cvimath/sample/CMakeLists.txt create mode 100644 cvimath/sample/README.md create mode 100644 cvimath/sample/sample_bf16_fp32.cpp create mode 100644 cvimath/sample/sample_fp32_bf16.cpp create mode 100644 cvimath/sample/sample_gemm.cpp create mode 100644 cvimath/sample/sample_mask.cpp create mode 100644 cvimath/sample/sample_reduce_mul.cpp create mode 100644 cvimath/sample/sample_set_val_by_mask.cpp create mode 100644 cvimath/sample/sample_sigmoid_linear_interp.cpp create mode 100644 cvimath/sample/sample_upsample.cpp create mode 100644 cvimath/src/1880v2_fp_convert.c create mode 100644 cvimath/src/CMakeLists.txt create mode 100644 cvimath/src/bf16_gemm.c create mode 100644 cvimath/src/blas_cpu.cpp create mode 100644 cvimath/src/chl_quan.cpp create mode 100644 cvimath/src/common.c create mode 100644 cvimath/src/fp32_bf16_kernel.c create mode 100644 cvimath/src/gen_lut.h create mode 
100644 cvimath/src/set_val_by_mask.c create mode 100644 cvimath/src/tiu_lut_atan.c create mode 100644 cvimath/src/tiu_lut_atan2.c create mode 100644 cvimath/src/tiu_reciprocal.c create mode 100644 cvimath/src/tiu_reshape_c.c create mode 100644 cvimath/src/tiu_sigmoid.c create mode 100644 cvimath/src/tiu_sqrt.c create mode 100644 cvimath/src/tiu_upsample.c create mode 100644 cvimath/src/util.c create mode 100644 cvimath/tests/CMakeLists.txt create mode 100644 cvimath/tests/common/test_native_ref.c create mode 100644 cvimath/tests/cvi1835/atan.cpp create mode 100644 cvimath/tests/cvi1835/atan2_degree.cpp create mode 100644 cvimath/tests/cvi1835/atan2_radian.cpp create mode 100644 cvimath/tests/cvi1835/bf16_fp32.cpp create mode 100644 cvimath/tests/cvi1835/blas_cpu.cpp create mode 100644 cvimath/tests/cvi1835/blas_tpu.cpp create mode 100644 cvimath/tests/cvi1835/depthwise_reshape_same.cpp create mode 100644 cvimath/tests/cvi1835/fp32_bf16.cpp create mode 100644 cvimath/tests/cvi1835/gemm.cpp create mode 100644 cvimath/tests/cvi1835/mask.cpp create mode 100644 cvimath/tests/cvi1835/reciprocal.cpp create mode 100644 cvimath/tests/cvi1835/sigmoid_linear_interp.cpp create mode 100644 cvimath/tests/cvi1835/sqrt.cpp create mode 100644 cvimath/tests/include/test_native_ref.h create mode 100644 cvimath/tests/include/test_tf_quant_util.h create mode 100644 cvimath/toolchain/toolchain-aarch64-linux.cmake create mode 100644 cvimath/toolchain/toolchain-gnueabihf-linux.cmake diff --git a/.version/2024-05-31.md b/.version/2024-05-31.md index 80b66efde..ec79e0332 100644 --- a/.version/2024-05-31.md +++ b/.version/2024-05-31.md @@ -19,3 +19,4 @@ | cvibuilder | cvibuilder | https://github.com/sophgo/cvibuilder.git | sg200x-dev | 4309f2a | | cvikernel | cvikernel | https://github.com/sophgo/cvikernel.git | sg200x-dev | 9f1f57a | | cviruntime | cviruntime | https://github.com/sophgo/cviruntime.git | sg200x-dev | 3f49386 | +| cvimath | cvimath | https://github.com/sophgo/cvimath.git | 
sg200x-dev | ce8705f | diff --git a/cvimath/.clang-format b/cvimath/.clang-format new file mode 100644 index 000000000..b64279812 --- /dev/null +++ b/cvimath/.clang-format @@ -0,0 +1,108 @@ +--- +Language: Cpp +# BasedOnStyle: Google +AccessModifierOffset: -1 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Left +AlignOperands: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: true +AllowShortLoopsOnASingleLine: true +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: true +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 100 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: true +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IncludeCategories: + - Regex: '^<.*\.h>' + Priority: 1 + - 
Regex: '^<.*' + Priority: 2 + - Regex: '.*' + Priority: 3 +IncludeIsMainRegex: '([-_](test|unittest))?$' +IndentCaseLabels: true +IndentWidth: 2 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Auto +TabWidth: 8 +UseTab: Never +... + diff --git a/cvimath/.gitignore b/cvimath/.gitignore new file mode 100644 index 000000000..cf56d02fa --- /dev/null +++ b/cvimath/.gitignore @@ -0,0 +1,5 @@ +.vscode +build +install + + diff --git a/cvimath/CMakeLists.txt b/cvimath/CMakeLists.txt new file mode 100644 index 000000000..3422abee7 --- /dev/null +++ b/cvimath/CMakeLists.txt @@ -0,0 +1,85 @@ +project(cvimath) + +cmake_minimum_required(VERSION 3.2.2) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) +#set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) +#set(CMAKE_INSTALL_RPATH "\${ORIGIN}/../lib;\${ORIGIN}/") + +if ("${CMAKE_BUILD_TYPE}" STREQUAL "") + set(CMAKE_BUILD_TYPE "Release") +endif() + +if("${CMAKE_TOOLCHAIN_FILE}" STREQUAL "") + message("No toolchain file found. 
Using host compiler.") + if ("${CMAKE_INSTALL_PREFIX}" STREQUAL "/usr/local") + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/install") + endif() +else() + if ("${CMAKE_INSTALL_PREFIX}" STREQUAL "/usr/local") + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/install_soc") + endif() +endif() + +set(CMAKE_C_INIT "-fsigned-char -fPIC -Werror=all -fdiagnostics-color=always") +set(CMAKE_CXX_INIT "-fsigned-char -fPIC -Werror=all -fdiagnostics-color=always -std=gnu++11") +if("${CMAKE_BUILD_TYPE}" STREQUAL "Release" OR "${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE") + set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_INIT} -O3" ) + set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_INIT} -O3" ) +elseif("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + set( SAFETY_FLAGS "-Werror -Wall -Wextra -ggdb -fno-strict-aliasing") + set( SAFETY_FLAGS "${SAFETY_FLAGS} -fsanitize=address") + set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_INIT} -g -O0 ${SAFETY_FLAGS}") + set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_INIT} -g -O0 ${SAFETY_FLAGS}" ) +else() + message(FATAL_ERROR "No build type!!!") +endif() + +message("==================================================") +message("[Summary]") +message("C compiler ${CMAKE_C_COMPILER}") +message("CXX compiler ${CMAKE_CXX_COMPILER}") +message("Build type ${CMAKE_BUILD_TYPE}") +message("Install dir ${CMAKE_INSTALL_PREFIX}") +message("==================================================") + +# Add externel libs +set( TPU_LD "-L${TPU_SDK_ROOT}/lib") +set( TPU_KERNEL_LIB "${TPU_LD} -lcvikernel") +# wait cvimath/cviruntime so are generated +set( TEST_LIBS cvimath cviruntime) + +# Add include path and set tpu libraries. 
+include_directories( + ${TPU_SDK_ROOT}/include + ${CVI_EXTRA}/include + "${CMAKE_CURRENT_SOURCE_DIR}/include") + +# https://stackoverflow.com/questions/30250494/ctest-not-detecting-tests +enable_testing() + +# ctest config +if (NOT CMAKE_CROSSCOMPILING) + if (ENABLE_TEST STREQUAL "ON") + add_subdirectory(tests) + endif() +endif() + +add_subdirectory(src) +add_subdirectory(sample) + +# export header +file(GLOB HEADERS + include/cvimath.h + include/cvimath_internal.h + include/test_cvikernel_util.h + ) + +# export sample +#file(GLOB SAMPLES sample/*) + +#install(FILES ${SAMPLES} DESTINATION samples/cvimath) +install(FILES ${CMAKE_SOURCE_DIR}/toolchain/toolchain-aarch64-linux.cmake DESTINATION samples/cvimath) +install(FILES ${HEADERS} DESTINATION include/cvimath) diff --git a/cvimath/README.md b/cvimath/README.md new file mode 100644 index 000000000..7f9f3bd5c --- /dev/null +++ b/cvimath/README.md @@ -0,0 +1,21 @@ +# CviMath + +## How to build + +### Requirements + +1. MLIR SDK + +SOC mode + +``` +$ mkdir build +$ cd build +$ cmake -G Ninja .. 
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ + -DTOOLCHAIN_ROOT_DIR=${PWD}/../../gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu \ + -DCMAKE_TOOLCHAIN_FILE=${PWD}/../toolchain/toolchain-aarch64-linux.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX= \ + -DTPU_SDK_ROOT= +$ ninja -j8 && ninja install +``` diff --git a/cvimath/clang-format.sh b/cvimath/clang-format.sh new file mode 100755 index 000000000..9190a8135 --- /dev/null +++ b/cvimath/clang-format.sh @@ -0,0 +1,8 @@ +#!/bin/bash +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +CLANG_ROOT=$(readlink -f $SCRIPT_DIR) + +find $CLANG_ROOT/include -regex '.*\.\(cpp\|h\|hpp\|cc\|c\|cxx\|inc\)' -exec clang-format -i {} \; +find $CLANG_ROOT/src -regex '.*\.\(cpp\|h\|hpp\|cc\|c\|cxx\|inc\)' -exec clang-format -i {} \; +find $CLANG_ROOT/tests -regex '.*\.\(cpp\|h\|hpp\|cc\|c\|cxx\|inc\)' -exec clang-format -i {} \; +find $CLANG_ROOT/sample -regex '.*\.\(cpp\|h\|hpp\|cc\|c\|cxx\|inc\)' -exec clang-format -i {} \; diff --git a/cvimath/include/cvimath.h b/cvimath/include/cvimath.h new file mode 100644 index 000000000..1d2d1bf8e --- /dev/null +++ b/cvimath/include/cvimath.h @@ -0,0 +1,84 @@ +#ifndef CVIMATH_H +#define CVIMATH_H + +#include + +// public function +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief This function calculated the unit length of a precahed i8 feature array + * + * @param precached Prefetched feature array in 1-D. Format: feature1, feature2, ... + * @param unit_precached_arr Output unit length. + * @param data_length The length of the feature. + * @param data_num The number of features. + */ +void cvm_gen_precached_i8_unit_length(int8_t *precached, float *unit_precached_arr, + const uint32_t data_length, const uint32_t data_num); + +/** + * @brief This function calculated the unit length of a precahed u8 feature array + * + * @param precached Prefetched feature array in 1-D. Format: feature1, feature2, ... + * @param unit_precached_arr Output unit length. 
+ * @param data_length The length of the feature. + * @param data_num The number of features. + */ +void cvm_gen_precached_u8_unit_length(uint8_t *precached, float *unit_precached_arr, + const uint32_t data_length, const uint32_t data_num); + +/** + * @brief Do inner product matching on i8 feature with given precached feature array. + * + * @param feature The input i8 feature to be compared. + * @param precached The precached feature array in 1-D. + * @param unit_precached_arr The unit length array of the precached. + * @param k_index The output matching index result in order. + * @param k_value The output matching value result in order. + * @param buffer The buffer used by this function, same length as precached. + * @param data_length The length of the single feature. + * @param data_num The number of features of the feature array. + * @param k Top k results, affects the length of k_index and k_value. + */ +void cvm_cpu_i8data_ip_match(int8_t *feature, int8_t *precached, float *unit_precached_arr, + uint32_t *k_index, float *k_value, float *buffer, + const uint32_t data_length, const uint32_t data_num, const uint32_t k); + +/** + * @brief Do inner product matching on u8 feature with given precached feature array. + * + * @param feature The input u8 feature to be compared. + * @param precached The precached feature array in 1-D. + * @param unit_precached_arr The unit length array of the precached. + * @param k_index The output matching index result in order. + * @param k_value The output matching value result in order. + * @param buffer The buffer used by this function, same length as precached. + * @param data_length The length of the single feature. + * @param data_num The number of features of the feature array. + * @param k Top k results, affects the length of k_index and k_value. 
+ */ +void cvm_cpu_u8data_ip_match(uint8_t *feature, uint8_t *precached, float *unit_precached_arr, + uint32_t *k_index, float *k_value, float *buffer, + const uint32_t data_length, const uint32_t data_num, const uint32_t k); + +// Legacy support for hj. +inline void __attribute__((always_inline)) +cvm_gen_db_i8_unit_length(int8_t *precached, float *unit_precached_arr, const uint32_t data_length, + const uint32_t data_num) { + cvm_gen_precached_i8_unit_length(precached, unit_precached_arr, data_length, data_num); +} + +inline void __attribute__((always_inline)) +cvm_gen_db_unit_length(uint8_t *precached, float *unit_precached_arr, const uint32_t data_length, + const uint32_t data_num) { + cvm_gen_precached_u8_unit_length(precached, unit_precached_arr, data_length, data_num); +} + +#ifdef __cplusplus +} +#endif + +#endif // CVIMATH_H diff --git a/cvimath/include/cvimath_internal.h b/cvimath/include/cvimath_internal.h new file mode 100644 index 000000000..555b6067b --- /dev/null +++ b/cvimath/include/cvimath_internal.h @@ -0,0 +1,1066 @@ +#ifndef CVIMATH_INTERNAL_H +#define CVIMATH_INTERNAL_H + +#include //bool +#include //size_t +#include "cvimath.h" + +// copy from lagency +// TODO: move to properly header files +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +static inline uint64_t align_up(uint64_t x, uint64_t n) { return (x + n - 1) / n * n; } + +/** + * please refer @example for more details + */ +#include + +#define CVK_MULTIPLIER_BIAS_PACKED_DATA_SIZE 9 +#define CVK_MULTIPLIER_ONLY_PACKED_DATA_SIZE 5 + +// public function +#ifdef __cplusplus +extern "C" { +#endif +/** + * @brief get lookup tabel shape + * + * @param cvk_ctx kernel structure + * @param [out] shape the table shape + */ +void cvm_table_shape(cvk_context_t *cvk_ctx, cvk_tl_shape_t *shape); + +/** + * @brief generate sqrt look up table for bf16 exponent part + * + * 
@param [out] table_data bf16 exponent part lookup table in host + * @param table_shape table shape + */ +void cvm_gen_sqrt(uint16_t *table_data, cvk_tl_shape_t *table_shape); + +/** + * @brief syntactic sugar for cvm_gen_sqrt/cvm_gen_sqrt_mantissa + * + * @param [out] sqrt_table_data bf16 exponent part lookup table in host + * @param [out] sqrt_table_data_mantissa bf16 fraction part lookup table in host + * @param table_shape table shape + */ +void cvm_sqrt_tbl(uint16_t *sqrt_table_data, uint16_t *sqrt_table_data_mantissa, + cvk_tl_shape_t *table_shape); + +/** + * @brief generate sqrt look up table for bf16 fraction part + * + * @param [out] table_mantissa bf16 fraction part lookup table in host + * @param table_shape table shape + */ +void cvm_gen_sqrt_mantissa(uint16_t *table_mantissa, cvk_tl_shape_t *table_shape); + +/** + * @brief implement sqrt in tpu memory + * + * @param cvk_ctx kernel structure + * @param tl_ifmap input tensor in tpu memory + * @param tl_buf working buffer + * @param tbl_answer lookup table tensor for bf16 exponent part in tpu memory + * @param tbl_answer_mantissa lookup table tensor for fraction part in tpu memory + * @param [out] tl_ofmap_bf16 result in in memory + * + * @example + * // 1. alloc in tpu memory + * // 2. prepare table + * cvm_sqrt_tbl(table_data, table_data_mantissa, &table_shape); + * // 3. put host data to tpu memory + * // 4. prepare command buffer + * cvm_emit_sqrt(cvk_ctx, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_mantissa, + * tl_ofmap_bf16); + * // 5. submit it + * test_submit_comp(rt_ctx, cvk_ctx); + * + * // 6. 
get result from tpu memory + * uint16_t* result = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16, + * tl_ofmap_bf16->fmt); + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_emit_sqrt(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tbl_answer, cvk_tl_t *tbl_answer_mantissa, cvk_tl_t *tl_ofmap_bf16); + +/** + * @brief generate reciprocal look up table for bf16 exponent part + * + * @param [out] table_data bf16 exponent part lookup table in host + * @param table_shape table shape + */ +void cvm_gen_reciprocal(uint16_t *table_data, cvk_tl_shape_t *table_shape); + +/** + * @brief generate reciprocal look up table for bf16 fraction part + * + * @param [out] table_mantissa bf16 fraction part lookup table in host + * @param table_shape table shape + */ +void cvm_gen_reciprocal_mantissa(uint16_t *table_mantissa, cvk_tl_shape_t *table_shape); + +/** + * @brief syntactic sugar for cvm_gen_reciprocal/cvm_gen_reciprocal_mantissa + * + * @param [out] sqrt_table_data bf16 exponent part lookup table in host + * @param [out] sqrt_table_data_mantissa bf16 fraction part lookup table in host + * @param table_shape table shape + */ +void cvm_reciprocal_tbl(uint16_t *table_data, uint16_t *table_mantissa, + cvk_tl_shape_t *table_shape); + +/** + * @brief implement reciprocal in tpu memory + * + * @param cvk_ctx kernel structure + * @param tl_ifmap input tensor in tpu memory + * @param tl_buf working buffer + * @param tbl_answer lookup table tensor for bf16 exponent part in tpu memory + * @param tbl_answer_mantissa lookup table tensor for fraction part in tpu memory + * @param [out] tl_ofmap_bf16 result in in memory + * + * @example + * int align = 1; // align eu(excution unit) + * // 1. alloc in tpu memory + * // 2. prepare table + * cvm_reciprocal_tbl(table_data, table_data_mantissa, &table_shape); + * // 3. put host data to tpu memory + * // 4. 
prepare command buffer + * cvm_emit_reciprocal(cvk_ctx, tl_ifmap, tl_buf, cvk_tl_table_answer, + * cvk_tl_table_answer_mantissa, tl_ofmap_bf16); + * + * // 5. submit it + * test_submit_comp(rt_ctx, cvk_ctx); + * + * // 6. get result from tpu memory + * uint16_t* result = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16, + * tl_ofmap_bf16->fmt); + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_emit_reciprocal(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tbl_answer, cvk_tl_t *tbl_answer_mantissa, + cvk_tl_t *tl_ofmap_bf16); + +/** + * @brief generate sigmoid lookup table in host, + * we leverage Linear interpolation fairly close to the original + * you can refer [wiki](https://en.wikipedia.org/wiki/Interpolation) for more details + * + * @param [out] sigmoid_table_data lookup table in host + * @param [out] sigmoid_table_data_slope slope table in host + * @param table_shape table shape + * @param range_start quantize range from, + * e.g: the original input range is -127 ~ 128, we quantize to -8 ~ 8 + * than -8 is our \range_start and 8 is \range_end + * @param range_end quantize range end + */ +void cvm_sigmoid_tbl(uint16_t *sigmoid_table_data, uint16_t *sigmoid_table_data_slope, + cvk_tl_shape_t *table_shape, int range_start, int range_end); + +/** + * @brief get scale factor from \range_start and \range_end + * + * @param range_start quantize range from + * @param range_end quantize range end + * + * @return scale factor + */ +float cvm_sigmoid_scale(int range_start, int range_end); + +/** + * @brief get sigmoid value by linear interpolation + * + * @param cvk_ctx kernel structure + * @param tl_ifmap input tensor in tpu memory + * @param tl_buf working buffer + * @param tl_table_answer sigmoid table in tpu memory generated by \cvm_sigmoid_tbl + * @param tl_table_answer_slope sigmoid slope table in tpu memory generated by \cvm_sigmoid_tbl + * @param [out] tl_ofmap_bf16 result in in 
memory + * @param scale scale factor generated by \cvm_sigmoid_scale + * + * @example + * // 1. alloc in tpu memory + * // 2. prepare table + * cvm_sigmoid_tbl(table_data, table_data_slope, &table_shape, range_start, range_end); + * float scale = cvm_sigmoid_scale(range_start, range_end); + * // 3. put host data to tpu memory + * // 4. prepare command buffer + * cvm_emit_sigmoid(cvk_ctx, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_slope, + * tl_ofmap_bf16, scale); + * // 5. submit it + * test_submit_comp(rt_ctx, cvk_ctx); + * + * // 6. get result from tpu memory + * uint16_t* result = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16, + * tl_ofmap_bf16->fmt); + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_emit_sigmoid(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_table_answer, cvk_tl_t *tl_table_answer_slope, + cvk_tl_t *tl_ofmap_bf16, float scale); + +/** + * @brief General Matrix Multiplication + * that equal \lhs_gaddr * \rhs_gaddr = \dest_gaddr + * + * @param cvk_ctx kernel structure + * @param lhs_gaddr left hand side device memory address + * @param rhs_gaddr right hand side device memory address + * @param dest_gaddr destination device memory address + * @param in_row \lhs_gaddr matrix row + * @param in_col \lhs_gaddr matrix col + * @param out_col \dest_gaddr matrix col + * @param fmt the possible value is \CVK_FMT_BF16 or \CVK_FMT_I8 or \CVK_FMT_U8 + * @example + * + * // 1. alloc host memory and put it to device memory + * // M=in_row K=in_col N=out_col + * cvk_mg_t *mg_A = _test_put_matrix_g(&ctx, M, K, CVK_FMT_BF16, (uint8_t *)bf16_A); + * cvk_mg_t *mg_B = _test_put_matrix_g(&ctx, K, N, CVK_FMT_BF16, (uint8_t *)bf16_B); + * cvk_mg_t *mg_R = _test_put_matrix_g(&ctx, M * 2, N, CVK_FMT_BF16, (uint8_t *)bf16_R); + * + * // 2. 
get device address for gemm + * gaddr_t gaddr_a = mg_A->start_address; + * gaddr_t gaddr_b = mg_B->start_address; + * gaddr_t gaddr_r = mg_R->start_address; + * + * // 3. prepare gemm descriptor + * cvm_gemm(cvk_ctx, gaddr_a, gaddr_b, gaddr_r, M, K, N); + * + * // 4. submit descriptor + * test_submit_comp(&ctx, cvk_ctx); + * + * // 5. get result from device to host + * uint16_t *bf16_ref = (uint16_t *)test_get_mg_mem_comp(&ctx, mg_R); + * + * @ return slice_num array of {M, N, K} + */ +size_t *cvm_gemm(cvk_context_t *cvk_ctx, uint64_t lhs_gaddr, uint64_t rhs_gaddr, + uint64_t dest_gaddr, int in_row, int in_col, int out_col, cvk_fmt_t fmt); + +/** + * @brief combine \cvm_gemm int8 result to int32 + * the raw output is seperate 32bit result info 4 part with bstride + * and we need to 'combine' it to human readable + * for instance, the following is the raw result + * lsb 31 msb + * 0x1 0x2 0x3 0x4 0x5 0x6 0x7 0x8 + * 0x9 0xa 0xb 0xc 0xd 0xe 0xf 0x0 + * 0x11 0x12 0x13 0x14 0x15 0x16 0x17 0x18 + * 0x19 0x20 0x21 0x22 0x23 0x24 0x25 0x26 + * + * the value by strategy could be column major: + * 1. 0x19110901 + * 2. 0x20120a02 + * 3. 0x21130b03 + * and so on + * + * @param cvm_gemm_strategy return strategy value from \cvm_gemm + * @param cvm_output raw result from \cvm_gemm + * @param [out] i32_R int32 result + * @param M row of output matrix + * @param N column of output matrix + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_combin_gemm_i8(size_t *cvm_gemm_strategy, uint8_t *cvm_output, uint32_t *i32_R, int M, + int N); +/** + * @brief fp32 to bf16 format int device memory + * + * @param cvk_ctx kernel structure + * @param gaddr_fp32 fp32 data with device memory address + * @param fp32_shape fp32 tensor shape + * @param [out] gaddr_bf16 bf16 data with device memory address + * @param bf16_shape bf16 tensor shape + * @param fmt tensor format such as \CVK_FMT_BF16 + * + * @example + * + * cvk_tl_shape_t s = {1, 2, 3, 4} + * // 1. 
put fp32 to device memory + * test_put_tg_mem_comp(rt_ctx, tg_with_fp32, data) + * // 2. init bf16 tg + * // 3. prepare command buffer + * cvm_s2s_fp32_bf16(cvk_ctx, tg_with_fp32->start_address, tg_with_fp32->shape, + * tg_with_bf16->start_address, tg_with_bf16->shape, CVK_FMT_BF16); + * // 4. submit it + * test_submit_comp(rt_ctx, cvk_ctx); + * // 5. get result from device memory + * uint16_t *dst_data = (uint16_t *)test_get_tg_mem_comp(rt_ctx, tg_with_bf16); + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_s2s_fp32_bf16(cvk_context_t *cvk_ctx, uint64_t gaddr_fp32, cvk_tg_shape_t fp32_shape, + uint64_t gaddr_bf16, cvk_tg_shape_t bf16_shape, cvk_fmt_t fmt); + +/** + * @brief generate lookup table for check input is 0 or not + * + * @param [out] table_0 lookup table for 0 or not + * @param table_shape table shape + */ +void cvm_gen_0_tbl(uint16_t *table_0, cvk_tl_shape_t *table_shape); + +// mask function +/** + * @brief get mask value that seperate 0 or not + * e.g: input = [0, 1, -1, 2] output [1, 0, 0, 0] + * please see \cvm_emit_mask for more details + * + * @param cvk_ctx kernel structure + * @param tl_ifmap input in tpu memory + * @param tl_buf working buffer + * @param tbl_answer lookup table for 0 or not in tpu memory, generate by \cvm_gen_0_tbl + * @param [out] tl_ofmap_bf16 mask result in tpu memory + * @param fmt tensor format such as \CVK_FMT_BF16 + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_emit_0_idx(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tbl_answer, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +/** + * @brief get mask value that check < 0 + * e.g: input = [0, 10, 6, -1, 0] output [0, 0, 0, 1, 0] + * please see \cvm_emit_mask for more details + * + * @param cvk_ctx kernel structure + * @param tl_ifmap input in tpu memory + * @param tl_buf working buffer + * @param tl_pos_neg_buf lookup table generate from \cvm_pos_neg_tbl + * @param 
[out] tl_ofmap_bf16 mask result in tpu memory + * @param fmt tensor format such as \CVK_FMT_BF16 + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_emit_neg_idx(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +/** + * @brief get mask value that check >= 0 + * e.g: input = [0, 10, 6, -1, 0] output [0, 1, 1, 0, 0] + * please see \cvm_emit_mask for more details + * + * @param cvk_ctx kernel structure + * @param tl_ifmap input in tpu memory + * @param tl_buf working buffer + * @param tl_pos_neg_buf lookup table generate from \cvm_pos_neg_tbl + * @param [out] tl_ofmap_bf16 mask result in tpu memory + * @param fmt tensor format such as \CVK_FMT_BF16 + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_emit_pos_idx(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +/** + * @brief invert 0/1 input + * e.g: input = [0, 1, 1, 1, 0] output [1, 0, 0, 0, 1] + * + * @param cvk_ctx kernel structure + * @param tl_ifmap input in tpu memory + * @param tl_buf working buffer + * @param [out] tl_ofmap_bf16 mask result in tpu memory + * @param fmt + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_emit_0_1_revert_input(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +// mask enum define +enum CVM_MASK_TYPE { + CVM_MASK_TYPE_GT_0 = 0, // remain > 0 + CVM_MASK_TYPE_GE_0, // remain >= 0 + CVM_MASK_TYPE_EQ_0, // remain = 0 + CVM_MASK_TYPE_LT_0, // remain < 0 + CVM_MASK_TYPE_LE_0, // remain <= 0 + CVM_MASK_MAX +}; + +/** + * @brief get mask for \CVM_MASK_TYPE case + * + * @param cvk_ctx kernel structure + * @param tl_ifmap input in tpu memory + * @param tl_buf working buffer + * @param tl_buf2 working buffer + * @param tl_buf3 working buffer + * @param 
tl_pos_neg_table lookup table generate from \cvm_pos_neg_tbl + * @param tl_0_idx_table lookup table for 0 or not in tpu memory generated by \cvm_gen_0_tbl + * @param [out] tl_ofmap_bf16 mask result in tpu memory + * @param fmt tensor format such as \CVK_FMT_BF16 + * @param mask \CVM_MASK_TYPE + * + * @example + * // 1. alloc in tpu memory + * // 2. prepare table + * cvm_gen_0_tbl(idx_0_table_data, &table_shape); + * cvm_pos_neg_tbl(table_data_atan_pos_neg, &table_shape); + * // 3. put host data to tpu memory + * // 4. prepare command buffer + * cvm_emit_mask(cvk_ctx, + * tl_ifmap, // input + * tl_buf, tl_buf2, tl_buf4, // tmp buffer + * tl_pos_neg_buf, tl_0_idx_table, // lookup table + * tl_ofmap_bf16, // output + * fmt, mode); + * + * // 5. submit it + * test_submit_comp(rt_ctx, cvk_ctx); + * + * // 6. get result from tpu memory + * uint16_t* result = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16, + * tl_ofmap_bf16->fmt); + * + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_emit_mask(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2, + cvk_tl_t *tl_buf3, cvk_tl_t *tl_pos_neg_table, cvk_tl_t *tl_0_idx_table, + cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt, enum CVM_MASK_TYPE mask); + +/** + * @brief generate lookup table for atan by degree + * + * @param [out] table_data_y0 atan by degree lookup table in host + * @param table_shape table shape + */ +void cvm_atan_fast_degree_y0(uint16_t *table_data_y0, cvk_tl_shape_t *table_shape); + +/** + * @brief generate lookup table for check value of absolute in [0,1] or > 1 + * atan2 used, [0-1] indicate 1, > 1 indicate with -1 + * + * @param [out] table_invert lookup table in host + * @param table_shape table shape + */ +void cvm_atan_s_01(uint16_t *table_invert, cvk_tl_shape_t *table_shape); + +/** + * @brief generate table for check input value is positive(>=0) or negtive(<0) + * by lookup table, 'pos_neg' means data is positive(>=0) is 1 or 
negtive(<0) is -1 + * + * @param [out] table_pos_neg lookup table in host + * @param table_shape table shape + */ +void cvm_pos_neg_tbl(uint16_t *table_pos_neg, cvk_tl_shape_t *table_shape); + +// deprecated code from \cvm_pos_neg_tbl +void cvm_atan_pos_neg(uint16_t *table_pos_neg, cvk_tl_shape_t *table_shape); + +/** + * @brief generate atan answer by lookup table, + * plz refer [git](https://github.com/xiezhq-hermann/atan_lookup) for more details + * + * @param [out] table_data_y0 atan answer lookup table in host + * @param table_shape table shape + */ +void cvm_atan_y0(uint16_t *table_data_y0, cvk_tl_shape_t *table_shape); + +/** + * @brief generate atan slope data, for more accuracy + * + * @param [out] table_slope atan slope lookup table in host + * @param table_shape table shape + */ +void cvm_atan_slope(uint16_t *table_slope, cvk_tl_shape_t *table_shape); + +/** + * @brief syntactic sugar for cvm_atan_y0/cvm_atan_slope/cvm_atan_s_01/cvm_pos_neg_tbl + * + * @param [out] table_data_atan_y0 atan answer lookup table in host + * @param [out] table_data_atan_slope atan slope lookup table in host + * @param [out] table_data_atan_invert lookup table in host + * @param [out] table_data_atan_pos_neg lookup table in host + * @param table_shape table shape + */ +void cvm_atan_tbl(uint16_t *table_data_atan_y0, uint16_t *table_data_atan_slope, + uint16_t *table_data_atan_invert, uint16_t *table_data_atan_pos_neg, + cvk_tl_shape_t *table_shape); + +/** + * @brief implement atan in tpu memory + * + * @param cvk_ctx kernel structure + * @param tl_ifmap input tensor in tpu memory + * @param tl_buf working buffer + * @param tl_buf2 working buffer + * @param tl_buf3 working buffer + * @param tl_y0_buf atan lookup table in tpu memory + * @param tl_slope_buf atan slope lookup table in tpu memory + * @param tl_invert_buf lookup table in tpu memory + * @param tl_pos_neg_buf lookup table in memory + * @param tl_table_answer reciprocal for bf16 exponent part in tpu memory + * @param 
tl_table_answer_mantissa reciprocal for bf16 fraction part in tpu memory + * @param [out] tl_ofmap_bf16 result in tpu memory + * @param fmt tensor format such as \CVK_FMT_BF16 + * + * @example + * // 1. alloc in tpu memory + * // 2.1. get reciprocal table in host + * cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape); + * // 2.2. get atan table in host + * cvm_atan_tbl(table_data_atan_y0, table_data_atan_slope, table_data_atan_invert, + * table_data_atan_pos_neg, &table_shape); + * // 3. put host data to tpu memory + * // 4. prepare command buffer + * cvm_atan_emit(cvk_ctx, tl_ifmap, tl_buf, tl_buf2, tl_buf4, tl_y0_buf, + * tl_slope_buf, tl_invert_buf, tl_pos_neg_buf, tl_reciprocal_table_answer, + * tl_reciprocal_table_answer_mantissa, tl_ofmap_bf16, fmt); + * + * // 5. submit it + * test_submit_comp(rt_ctx, cvk_ctx); + * // 6. get result from tpu memory + * uint16_t *ofmap_data = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16, + * tl_ofmap_bf16->fmt); + * @return status, 0 means success, other means generates command fail + */ +int cvm_atan_emit(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2, + cvk_tl_t *tl_buf3, cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_slope_buf, + cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_table_answer, + cvk_tl_t *tl_table_answer_mantissa, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); +// atan2 function +/** + * @brief syntactic sugar for generate atan in degree lookup table in + * host/cvm_atan_s_01/cvm_pos_neg_tbl + * + * @param [out] table_data_atan_y0 atan answer lookup table in host + * @param [out] table_data_atan_invert lookup table in host + * @param [out] table_data_atan_pos_neg lookup table in host + * @param table_shape table shape + */ +void cvm_atan_fast_degree_tbl(uint16_t *table_data_atan_y0, uint16_t *table_data_atan_invert, + uint16_t *table_data_atan_pos_neg, cvk_tl_shape_t *table_shape); + +/** + * @brief implement atan2 by degree 
in tpu memory, implemented by atan. you can refer + * [wiki](https://en.wikipedia.org/wiki/Atan2) for more details + * + * @param cvk_ctx kernel structure + * @param y input tensor in tpu memory + * @param x input tensor in tpu memory + * @param tl_buf working buffer + * @param tl_buf2 working buffer + * @param tl_buf3 working buffer + * @param tl_y0_buf atan2 lookup table in tpu memory + * @param tl_invert_buf lookup table in tpu memory + * @param tl_pos_neg_buf lookup table in memory + * @param tl_table_answer reciprocal for bf16 exponent part in tpu memory + * @param tl_table_answer_mantissa reciprocal for bf16 fraction part in tpu memory + * @param [out] tl_ofmap_bf16 result in tpu memory + * @param fmt tensor format such as \CVK_FMT_BF16 + * + * @example + * // 1. alloc in tpu memory + * // 2.1. get reciprocal table in host + * cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape); + * // 2.2. get atan table in host + * cvm_atan_fast_degree_tbl(table_data_atan_y0, table_data_atan_slope, table_data_atan_invert, + * table_data_atan_pos_neg, &table_shape); + * // 3. put host data to tpu memory + * // 4. prepare command buffer + * cvm_atan2_fast_degree_emit( + * cvk_ctx, tl_ifmap2, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, + * tl_invert_buf, tl_pos_neg_buf, tl_reciprocal_table_answer, + * tl_reciprocal_table_answer_mantissa, tl_ofmap_bf16, fmt); + * + * // 5. submit it + * test_submit_comp(rt_ctx, cvk_ctx); + * // 6. 
get result from tpu memory + * uint16_t *ofmap_data = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16, + * tl_ofmap_bf16->fmt); + */ +void cvm_atan2_fast_degree_emit(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *x, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_y0_buf, + cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf, + cvk_tl_t *tl_table_answer, cvk_tl_t *tl_table_answer_mantissa, + cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); +/** + * @brief implement atan2 in tpu memory, implemented by atan. you can refer + * [wiki](https://en.wikipedia.org/wiki/Atan2) for more details + * + * @param cvk_ctx kernel structure + * @param y input tensor in tpu memory + * @param x input tensor in tpu memory + * @param tl_buf working buffer + * @param tl_buf2 working buffer + * @param tl_buf3 working buffer + * @param tl_y0_buf atan2 lookup table in tpu memory + * @param tl_invert_buf lookup table in tpu memory + * @param tl_pos_neg_buf lookup table in memory + * @param tl_table_answer reciprocal for bf16 exponent part in tpu memory + * @param tl_table_answer_mantissa reciprocal for bf16 fraction part in tpu memory + * @param [out] tl_ofmap_bf16 result in tpu memory + * @param fmt tensor format such as \CVK_FMT_BF16 + * + * @example + * // 1. alloc in tpu memory + * // 2.1. get reciprocal table in host + * cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape); + * // 2.2. get atan table in host + * cvm_atan_fast_degree_tbl(table_data_atan_y0, table_data_atan_slope, table_data_atan_invert, + * table_data_atan_pos_neg, &table_shape); + * // 3. put host data to tpu memory + * // 4. prepare command buffer + * cvm_atan2_fast_degree_emit( + * cvk_ctx, tl_ifmap2, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, + * tl_invert_buf, tl_pos_neg_buf, tl_reciprocal_table_answer, + * tl_reciprocal_table_answer_mantissa, tl_ofmap_bf16, fmt); + * + * // 5. submit it + * test_submit_comp(rt_ctx, cvk_ctx); + * // 6. 
get result from tpu memory + * uint16_t *ofmap_data = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16, + * tl_ofmap_bf16->fmt); + */ +void cvm_atan2_merge_emit(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *x, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_y0_buf, + cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf, + cvk_tl_t *tl_table_answer, cvk_tl_t *tl_table_answer_mantissa, + cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +/** + * @brief get lookup table size for host alloc mamory used + * + * @param cvk_ctx kernel structure + * @param table_shape table shape + * @param fmt tensor format such as \CVK_FMT_BF16 + * + * @return table size in bytes + */ +uint64_t cvm_lut_tbl_bytesize(cvk_context_t *cvk_ctx, cvk_tl_shape_t *table_shape, cvk_fmt_t fmt); + +/** + * @brief calculate new proper reshape channel for depthwise + * current only support batch = 1 + * + * @param cvk_ctx kernel structure + * @param ic origin input shape of c + * @param ih origin input shape of h + * @param iw origin input shape of w + * @param kh origin kerenl shape of h + * @param kw origin kerenl shape of w + * @param pad_right padding right with input + * @param pad_left padding left with input + * @param stride_h stride h with input + * @param stride_w stride w with input + * @param [out] tl_load_shape shape structure for input in tpu memory + * @param [out] new_tl_ifmap_stride deprecated that stride for input in tpu memory + * @param [out] new_tg_ifmap_shape shape structure for input in device memory + * @param [out] new_tg_ifmap_stride stride structure for input in device memory + * @param [out] new_tl_weight_shape reshape weight in tpu memory + * @param [out] new_tl_bias_shape reshape bias in tpu memory + * @param [out] new_tl_ofmap_shape reshape output in tpu memory + * @param fmt the possible value is \CVK_FMT_BF16 or \CVK_FMT_I8 or \CVK_FMT_U8 + * @param eu_align currently MUST set 1 is force align with hardware + * + * @example + * int align = 1; 
// force align + * cvk_tiu_depthwise_pt_convolution_param_t *p; + * // 1. get reshaped shape + * int r = cvm_reshape_channel_same( + * cvk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, stride_h, stride_w, + * &tl_load_shape, &tl_load_stride, &tg_shape, &tg_stride, &tl_weight_shape, + * &tl_bias_shape, &tl_output_shape, fmt, align); + * // reshape fail + * if (r == -1) { + * return -1; + * } + * + * // 2.1 load input + * // load input into tpu memory + * int load_align = 0; // not align for pack + * tmp_tl_load = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_load_shape, fmt, load_align); + * tmp_tg = test_alloc_tg_mem_comp(&rt_ctx, cvk_ctx, tg_shape, fmt); + * tmp_tg->stride = tg_stride; + + * // int8 + * cvk_tdma_g2l_tensor_copy_param_t p1; + * cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p1); + * test_submit_comp(&rt_ctx, cvk_ctx); + * test_free_tg_mem_comp(&rt_ctx, tmp_tg); + + + * // fit for hw + * int align_in_tl = 1; + * tmp_tl_load->stride = bmk1880v2_tensor_lmem_default_stride( + * cvk_ctx, tmp_tl_load->shape, fmt, align_in_tl); + * p->ifmap = tmp_tl_load; + + * // 2.2 prepare load bias, put to tg and load back + * if (has_bias) { + * // bias must i8 + * int no_bias_align = 0; + * p->bias = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_bias_shape, fmt, no_bias_align); + * + * // duplicate bias and replace old + * uint32_t *new_bias = cvm_reshape_channel_weight( + * (uint8_t *)bias, tl_bias_shape.n, tl_bias_shape.c, tl_bias_shape.h, + * tl_bias_shape.w, org_oc, fmt); + * + * test_put_tensor_g2l_comp(&rt_ctx, cvk_ctx, p->bias, bias); + * } + * + * // 2.3 prepare load weight, put to tg and load back + * { + * int weight_align = 1; + * p->weight = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_weight_shape, fmt, weight_align); + * // duplicate kernel with c + * uint8_t *new_weight = cvm_reshape_channel_weight( + * (uint8_t *)weight, tl_weight_shape.n, tl_weight_shape.c, tl_weight_shape.h, + * tl_weight_shape.w, org_oc, fmt); + * + * test_put_tensor_g2l_comp(&rt_ctx, 
cvk_ctx, p->weight, (u16 *)weight); + * } + * + * // 2.4 prepard ofmap + * { + * // we allocate 'same' mode shape + * int output_align = 1; // hw need + * p->ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_output_shape, fmt, output_align); + * } + * + * // 3. prepare command buffer + * cvk_ctx->ops->tiu_pt_depthwise_convolution(cvk_ctx, p); + * + * // 4. submit it + * test_submit_comp(rt_ctx, cvk_ctx); + * + * // 5. get result from tpu memory + * output = test_get_tensor_l2g_comp(&rt_ctx, cvk_ctx, p->ofmap, fmt); + * + * @return status, -1 means fail, other means reshape slice success + */ +int cvm_reshape_channel_same(cvk_context_t *cvk_ctx, int ic, int ih, int iw, int kh, int kw, + int pad_right, int pad_left, int stride_h, int stride_w, + cvk_tl_shape_t *tl_load_shape, cvk_tl_stride_t *new_tl_ifmap_stride, + cvk_tg_shape_t *new_tg_ifmap_shape, + cvk_tg_stride_t *new_tg_ifmap_stride, + cvk_tl_shape_t *new_tl_weight_shape, cvk_tl_shape_t *new_tl_bias_shape, + cvk_tl_shape_t *new_tl_ofmap_shape, cvk_fmt_t fmt, int eu_align); + +/** + * @brief re-construct bias content by reshape channel + * + * @param bias original bias in host memory + * @param ni reshape bias shape of n + * @param ci reshape bias shape of c + * @param hi reshape bias shape of h + * @param wi reshape bias shape of w + * @param old_bias_c origin bias shape of c + * @param fmt the possible value is \CVK_FMT_BF16 or \CVK_FMT_I8 or \CVK_FMT_U8 + * + * @return bias host data + */ +uint32_t *cvm_reshape_channel_bias(uint8_t *bias, int ni, int ci, int hi, int wi, int old_bias_c, + cvk_fmt_t fmt); + +/** + * @brief re-construct weight content by reshape channel + * + * @param weight original bias in host memory + * @param ni reshape weight shape of n + * @param ci reshape weight shape of c + * @param hi reshape weight shape of h + * @param wi reshape weight shape of w + * @param old_weight_c origin weight shape of c + * @param fmt the possible value is \CVK_FMT_BF16 or \CVK_FMT_I8 or \CVK_FMT_U8 + * + 
* @return weight host data + */ +uint8_t *cvm_reshape_channel_weight(uint8_t *weight, int ni, int ci, int hi, int wi, + int old_weight_c, cvk_fmt_t fmt); + +typedef struct cvm_tiu_atan2_param { + cvk_tl_t *a; + cvk_tl_t *b; + cvk_tl_t *res; + cvk_tl_t *buf1; + cvk_tl_t *buf2; + cvk_tl_t *buf3; + cvk_tl_t *buf4; + cvk_tl_t *buf5; + cvk_tl_t *buf6; + cvk_tl_t *y0; + cvk_tl_t *slope; + cvk_tl_t *invert; + cvk_tl_t *pos_neg_table; + cvk_tl_t *reciprocal_table_answer; + cvk_tl_t *reciprocal_table_answer_mantissa; + cvk_tl_t *sqrt_table_answer; + cvk_tl_t *sqrt_table_answer_mantissa; + cvk_tl_t *idx_0_table; + cvk_fmt_t fmt; + bool output_degree; +} cvm_tiu_atan2_param_t; + +typedef struct cvk_tiu_mask_param { + cvk_tl_t *ifmap; + cvk_tl_t *ofmap; + cvk_tl_t *buf; + cvk_tl_t *buf2; + cvk_tl_t *buf3; + cvk_tl_t *pos_neg_table; + cvk_tl_t *idx_0_table; + cvk_fmt_t fmt; +} cvm_tiu_mask_param_t; + +typedef struct cvm_tiu_sigmoid_param { + float scale; + cvk_tl_t *ifmap; + cvk_tl_t *buf; + cvk_tl_t *table_answer; + cvk_tl_t *table_answer_slope; + cvk_tl_t *ofmap; +} cvm_tiu_sigmoid_param_t; + +typedef struct cvm_tiu_sqrt_param { + cvk_tl_t *a; + cvk_tl_t *res; + cvk_tl_t *buf; + cvk_tl_t *sqrt_table_answer; + cvk_tl_t *sqrt_table_answer_mantissa; +} cvm_tiu_sqrt_param_t; + +/** + * @brief get \quantized_multiplier and its \right_shift, + * please refer + * \https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/common.h:MultiplyByQuantizedMultiplier + * for more details + * + * @param real_multiplier + * @param quantized_multiplier + * @param right_shift + */ +void cvm_get_chl_quan(float real_multiplier, uint32_t *quantized_multiplier, int *right_shift); + +/** + * @brief + * + * @param c + * @param quantized_multiplier + * @param right_shift + * @param cal_data + * @param bias_data + * @param has_bias + */ +void cvm_fill_chl_quan_data(const uint32_t c, const uint32_t quantized_multiplier, + const int right_shift, uint8_t *cal_data, int32_t 
*bias_data, + bool has_bias); + +/** + * @brief + * + * @param c + * @param quantized_multiplier + * @param right_shift + * @param bias_data + * @param has_bias + * + * @return + */ +uint8_t *cvm_get_chl_quan_data(const uint32_t c, const uint32_t quantized_multiplier, + const int right_shift, int32_t *bias_data, bool has_bias); + +/** + * @brief get byte size of input \fmt + * + * @param fmt \cvk_fmt_t structure + * + * @example + * int sz = cvm_bytesize_of_fmt(CVK_FMT_BF16); + * assert (sz == 2 && "bf16 takes 2 bytes") + * + * sz = cvm_bytesize_of_fmt(CVK_FMT_I8); + * assert (sz == 1 && "int8 takes 1 bytes") + * @return byte size of fmt + */ +int cvm_bytesize_of_fmt(cvk_fmt_t fmt); + +/** + * @brief reduce multiplication for h,w + * the possible shape will be <1, c, 1, 1> + * you could refer [here](https://en.wikipedia.org/wiki/Reduction_Operator) for + * more details + * + * @param cvk_ctx kernel structure + * @param [out] mp_tl_mulsum input tensor in tpu memory, the shape should be <1, c, h, w> + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_reduce_hw_mul(cvk_context_t *cvk_ctx, cvk_tl_t *mp_tl_mulsum); + +/** + * @brief bf16 to fp32, ONLY move bf16 to fp32 high 16 bits part, + * the memory layout as following: + * + * bf16: 0x4300 + * 0 16 (bit) + * ----- + * 0x4300 + * + * fp32: 0x43000000 + * ----- + * 0 16 32 + * 0x 0x43 + * + * @param cvk_ctx kernel structure + * @param tg_bf16 bf16 data in device memory + * @param [out] tg_fp32 fp32 data in decive memory, the w shape SHOULD be double with + * \tg_bf16->shape.w + */ +void cvm_bf16_fp32(cvk_context_t *cvk_ctx, cvk_tg_t *tg_bf16, cvk_tg_t *tg_fp32); + +/** + * @brief set value by mask(0/1) + * + * @param [in] tl_ifmap image input, MUST uint8 + * @param [in] tl_mask mask value, it MUST be 0 or 1, it will DIRTY it + * @param [in] tl_buf + * @param [in,out] tl_ofmap image output, MUST uint8, it will DIRTY it + * + * @return status, 0 means success, other means generates 
command fail + */ +int cvm_set_image_by_u8mask(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_mask, cvk_tl_t *tl_ofmap); + +/** + * @brief set value by mask(0/1) by DePthwise + * 0 means keep \tl_ofmap one + * 1 means overwrite with \tl_ifmap + * + * @param [in] tl_ifmap image input, MUST uint8 + * @param [in] tl_mask mask value, it MUST be 0 or 1, it will DIRTY it + * @param [in] tl_kernel for mask reverting(0/1->1/0) that the contain MUST BE -1 with int8 + * and shape SHOULD BE <1, tl_ifmap->shape.c, 1, 1> + * @param [in] tl_bias for mask reverting(0/1->1/0) that the contain MUST BE 1 with int8, + * seperate high/low part, and shape SHOULD BE <2, tl_ifmap->shape.c, 1, 1> + * @param [in,out] tl_ofmap image output, MUST uint8, it will DIRTY it + * + * @return status, 0 means success, other means generates command fail + */ + +int cvm_set_image_by_u8mask_dp(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_mask, + cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, cvk_tl_t *tl_ofmap); + +/** + * @brief set value by mask and threshold, set it + * if \tl_mask && (int8_t)\tl_update_tbl < threshold + * + * @param [in] tl_ifmap image input, MUST uint8 + * @param [in] tl_mask mask value, it MUST be 0 or 1, it will DIRTY it + * @param [in] tl_update_tbl the value range will under int8, it will DIRTY it + * @param [in,out] tl_ofmap image output, MUST uint8, it will DIRTY it + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_set_image_by_two_info_i8(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf2, + cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl, uint8_t threshold, + cvk_tl_t *tl_ofmap); + +/** + * @brief set value by mask and threshold by DePthwise, set it + * if \tl_mask && (int8_t)\tl_update_tbl < threshold + * + * @param [in] tl_ifmap image input, MUST uint8 + * @param [in] tl_kernel set all to 1 for \tl_update_tbl * 1 - threshold + * to test larger or smaller, + * that MUST BE 1 with int8 and shape SHOULD BE 
<1, tl_ifmap->shape.c, 1, 1> + * @param [in] tl_mask mask value, it MUST be 0 or 1, it will DIRTY it + * @param [in] tl_update_tbl the value range will under int8, it will DIRTY it + * @param [in] tl_threshold for boradcast \threshold to bias + * the type MUST BE int8 and seperate high/low part and it will DIRTY it + * @param [in,out] tl_ofmap image output, MUST uint8, it will DIRTY it + * + * @return status, 0 means success, other means generates command fail + */ + +int cvm_set_image_by_two_info_i8_dp(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_kernel, + cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl, + cvk_tl_t *tl_threshold, cvk_tl_t *tl_ofmap); + +/** + * @brief get abs(\tl_ifmap-tl_ifmap2) + * + * @param [in] tl_ifmap image input, MUST uint8 + * @param [in] tl_ifmap2 image input, MUST uint8, it will DIRTY it + * @param [out] tl_ofmap image output, MUST uint8, it will DIRTY it + * + * @return status, 0 means success, o, MUST uint8ther means generates command fail + */ +int cvm_gen_image_diff(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2, + cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2, cvk_tl_t *tl_ofmap); + +/** + * @brief update \tl_ofmap by \threshold_a, \threshold_b, + * plz refer \sample_set_val_by_mask.cpp for more details + * + * @param [out] tl_mask return 0/1 mask + * @param [in] tl_update_tbl u8 + * @param [in,out] tl_ofmap image output, int8 + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_update_tbl_by_threshold(cvk_context_t *ctx, cvk_tl_t *tl_mask, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_update_tbl, + uint8_t threshold_a, uint8_t threshold_b, cvk_tl_t *tl_ofmap); + +/** + * @brief set value by mask, update \tl_ofmap once (uint8_t)tl_update_tbl >= threshold + * + * @param [in] tl_ifmap image input, MUST uint8 + * @param [in] tl_update_tbl the value range will under uint8 + * @param [in,out] tl_ofmap image output, MUST uint8, it will DIRTY it + * + * @return status, 0 
means success, other means generates command fail + */ +int cvm_set_image_by_two_info_u8(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_update_tbl, uint8_t threshold, + cvk_tl_t *tl_ofmap); + +/** + * @brief set value by mask + * if (int8_t)\tl_update_tbl > threshold + * + * @param [in] tl_ifmap image input + * @param [in] tl_update_tbl int8, MUST uint8, it will DIRTY + * @param [in,out] tl_ofmap image output, MUST uint8, it will DIRTY it + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_blend_image_by_tbl(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_update_tbl, uint8_t threshold, + uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap); +/** + * @brief get upsample 2d with nearest mode + * + * @param [in] tl_ifmap + * @param [in] tl_weight upsample used that fill with 1 + * @param [out] tl_ofmap + * + * @return status, 0 means success, other means generates command fail + */ + +int cvm_upsample2d(cvk_context_t *ctx, cvk_tl_t *tl_input, cvk_tl_t *tl_weight, + cvk_tl_t *tl_output); +#ifdef __cplusplus +} +#endif + +#endif // CVIMATH_INTERNAL_H diff --git a/cvimath/include/test_cvikernel_util.h b/cvimath/include/test_cvikernel_util.h new file mode 100644 index 000000000..75fc164f5 --- /dev/null +++ b/cvimath/include/test_cvikernel_util.h @@ -0,0 +1,393 @@ +#ifndef CVIMATH_TEST_UTIL_H +#define CVIMATH_TEST_UTIL_H + +#include +#include "cvikernel/cvikernel.h" + +#include "bmruntime.h" +#include "bmruntime_bmkernel.h" + +#include +#include // pow +#include // uint8_t / uint16_t +#include /* printf, scanf, NULL */ +#include /* malloc, free, rand */ +#include // strncpy + +// copy from lagency +// TODO: move to properly header files +#define __ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#define ALIGN(x, a) __ALIGN_MASK(x, (__typeof__(x))(a)-1) +typedef uint32_t laddr_t; +typedef uint64_t gaddr_t; +typedef uint32_t ctrl_t; +#define CTRL_NULL 0 
+#define CTRL_AL (1 << 0) // alloc aligned with EU_NUM +#define CTRL_TP (1 << 5) // transpose +#define CTRL_NEURON (1 << 11) // mark neuron address in GDMA + +#define LADDR_INVALID (0xFFFFFFFF) +#define GADDR_INVALID (0x000000FFFFFFFFFFULL) +static inline int ceiling_func(int numerator, int denominator) { + return (numerator + denominator - 1) / denominator; +} +static inline int ceiling_func_shift(int numerator, int shift) { + return (numerator + (1 << shift) - 1) >> shift; +} +static inline int get_num_shift(uint64_t num) { + int n = 0; + while (!(num & 1)) { + n++; + num >>= 1; + } + return n; +} + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * bm runtime binds with bm kernel. + * cvi kernel still needs bm runtime. + * + * Need to create the separate function to combine bm runtime and cvi kernel. + * Function with postfix _comp (compatible) for such combination. + */ + +#define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) + +/** + * @brief submit command buffer + * + * @param rt_ctx runtime structure + * @param cvk_ctx kernel structure + */ +void test_submit_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx); + +/** + * @brief alloc tensor from device memory + * + * @param rt_ctx runtime structure + * @param cvk_ctx kernel structure + * @param shape tensor shape + * @param fmt tensor format such as \CVK_FMT_U16 or \CVK_FMT_U8 + * + * @return cvk_tg_t structure + */ +cvk_tg_t *test_alloc_tg_mem_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, + cvk_tg_shape_t shape, cvk_fmt_t fmt); + +/** + * @brief alloc matrix from device memory + * + * @param rt_ctx runtime structure + * @param shape matrix shape + * @param fmt tensor format such as \CVK_FMT_U16 or \CVK_FMT_U8 + * + * @return cvk_mg_t structure + */ +cvk_mg_t *test_alloc_mg_mem_comp(CVI_RT_HANDLE *rt_ctx, cvk_mg_shape_t shape, cvk_fmt_t fmt); + +/** + * @brief free tensor from device memory + * + * @param rt_ctx runtime structure + * @param tg pointer of tg + */ +void 
test_free_tg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_tg_t *tg); + +/** + * @brief free matrix from device memory + * + * @param rt_ctx runtime structure + * @param mg pointer of mg + */ +void test_free_mg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_mg_t *mg); + +/** + * @brief put host data to alloced tensor device memory + * + * @param rt_ctx runtime structure + * @param tg pointer of tg + * @param data[] host data + */ +void test_put_tg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_tg_t *tg, uint8_t data[]); + +/** + * @brief put host data to alloced matrix device memory + * + * @param rt_ctx runtime structure + * @param mg pointer of mg + * @param data[] host data + */ +void test_put_mg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_mg_t *mg, uint8_t data[]); + +/** + * @brief syntactic sugar for \test_alloc_mg_mem_comp -> \test_put_mg_mem_comp + * + * @param rt_ctx runtime structure + * @param mg_data_format mg format such as \CVK_FMT_U16 or \CVK_FMT_U8 + * @param data[] host data + * + * @return + */ +cvk_mg_t *test_put_matrix_g(CVI_RT_HANDLE *rt_ctx, const cvk_mg_shape_t shape, + cvk_fmt_t mg_data_format, uint8_t data[]); + +/** + * @brief get tensor data from device memory + * + * @param rt_ctx runtime structure + * @param tg pointer of tg + * + * @return data in device memory + */ +uint8_t *test_get_tg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_tg_t *tg); + +/** + * @brief get matrix data from device memory + * + * @param rt_ctx runtime structure + * @param mg pointer of mg + * + * @return data in device memory + */ +uint8_t *test_get_mg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_mg_t *mg); + +/** + * @brief get tensor data from tpu memory, + * the data path should be tpu memory -> device memory -> host memory + * + * @param rt_ctx runtime structure + * @param cvk_ctx kernel structure + * @param tl pointer of tl + * + * @return data in tpu memory + */ +uint8_t *test_get_tensor_l2g_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, + const cvk_tl_t *tl); + +/** + * 
@brief get matrix data from tpu memory, + * the data path should be tpu memory -> device memory -> host memory + * + * @param rt_ctx runtime structure + * @param cvk_ctx kernel structure + * @param ml pointer of ml + * + * @return data in tpu memory + */ +uint8_t *test_get_matrix_l2g_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, + const cvk_ml_t *ml); + +/** + * @brief put host data to tpu memory with tensor + * the data path should be host memory -> device memory -> tpu memory + * + * @param rt_ctx runtime structure + * @param cvk_ctx kernel structure + * @param tl pointer of tl + * @param data[] data in host memory + */ +void test_put_tensor_g2l_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, const cvk_tl_t *tl, + + uint8_t data[]); + +/** + * @brief put host data to tpu memory with matrix + * the data path should be host memory -> device memory -> tpu memory + * + * @param rt_ctx runtime structure + * @param cvk_ctx kernel structure + * @param ml pointer of ml + * @param data[] data in host memory + */ +void test_put_matrix_g2l_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, const cvk_ml_t *ml, + uint8_t data[]); + +/** + * @brief alloc tensor from tpu memory + * + * @param cvk_ctx kernel structure + * @param shape shape of tensor + * @param fmt tensor format such as \CVK_FMT_U16 or \CVK_FMT_U8 + * @param eu_align is align excution unit + * + * @return pointer of tl + */ +cvk_tl_t *test_alloc_tl(cvk_context_t *cvk_ctx, cvk_tl_shape_t shape, cvk_fmt_t fmt, int eu_align); + +/** + * @brief free tpu memory with tensor + * + * @param cvk_ctx kernel structure + * @param tl pointer of tl + */ +void test_free_tl(cvk_context_t *cvk_ctx, const cvk_tl_t *tl); + + +/** + * @brief a small structure for getting RT memory information + */ +typedef struct _AddrInfo +{ + uint64_t phy_addr; + uint64_t size_bytes; + uint8_t *vir_addr; + int mem; +}AddrInfo; + +/** + * @brief get tpu global memory and assign info to an structure + * + * @param[in] bm_ctx runtime 
structure + * @param[out] pAddrInfo a structure for physical, virtual address + */ +uint8_t *test_get_vp_addr(bmctx_t *ctx, AddrInfo *pAddrInfo); + +/** + * @brief free tpu global memory from an info structure + * + * @param[in] bm_ctx runtime structure + * @param[in] pAddrInfo a structure for physical, virtual address + */ +void test_free_vp_addr(bmctx_t *ctx, AddrInfo *pAddrInfo); + + +/** + * @breif wrapper function + */ +// tensor in local functions +// get tl size +static inline uint64_t tl_shape_size(const cvk_tl_shape_t *s) { + return (uint64_t)s->n * s->c * s->h * s->w; +} + +static inline uint64_t tg_shape_size(const cvk_tg_shape_t *s) { + return (uint64_t)s->n * s->c * s->h * s->w; +} + +static inline uint64_t mg_shape_size(const cvk_mg_shape_t *s) { return (uint64_t)s->row * s->col; } + +static inline void free_tl(cvk_context_t *cvk_ctx, const cvk_tl_t *t) { + return cvk_ctx->ops->lmem_free_tensor(cvk_ctx, t); +} + +typedef struct { + cvk_fmt_t src_fmt; + cvk_fmt_t dst_fmt; +} cvk_fmt_type; + +static inline int bitsize_of_fmt(cvk_fmt_t fmt) { + switch (fmt) { + case CVK_FMT_F32: + case CVK_FMT_I32: + return 32; + case CVK_FMT_F16: + case CVK_FMT_I16: + case CVK_FMT_U16: + case CVK_FMT_BF16: + return 16; + case CVK_FMT_I8: + case CVK_FMT_U8: + return 8; + case CVK_FMT_I4: + return 4; + case CVK_FMT_I2: + return 2; + case CVK_FMT_I1: + return 1; + default: + assert(0); + return -1; + } +} +static inline int bytesize_of_fmt(cvk_fmt_t fmt) { return bitsize_of_fmt(fmt) / 8; } +static inline void tg_2_tl_shape(cvk_tl_shape_t *tl, cvk_tg_shape_t *tg) { + tl->n = tg->n; + tl->c = tg->c; + tl->h = tg->h; + tl->w = tg->w; +} + +static inline void tl_2_tg_shape(cvk_tg_shape_t *tg, cvk_tl_shape_t *tl) { + tg->n = tl->n; + tg->c = tl->c; + tg->h = tl->h; + tg->w = tl->w; +} +/** + * @brief init test case with runtime/kernel + * + * @param rt_ctx runtime structure + * @param cvk_ctx kernel structure + */ +// static inline void _test_init(CVI_RT_HANDLE ctx, 
cvk_context_t **cvk_ctx) { +// CVI_RT_HANDLE _ctx = (CVI_RT_HANDLE)ctx; +// int ret = CVI_RT_Init(&_ctx); +// if (ret != CVI_SUCCESS) { +// fprintf(stderr, "init failed, err %d\n", ret); +// exit(-1); +// } +// +// int alloc_size = 0x10000; +// *cvk_ctx = (cvk_context_t*) CVI_RT_RegisterKernel(_ctx, alloc_size); +// printf("alloc command buffer %d bytes success\n", alloc_size); +//} +// static inline void _test_exit(CVI_RT_HANDLE ctx, cvk_context_t *cvk_ctx) { +// CVI_RT_UnRegisterKernel(cvk_ctx); +// CVI_RT_HANDLE _ctx = (CVI_RT_HANDLE)ctx; +// CVI_RT_DeInit(_ctx); +//} + +static inline void test_init(CVI_RT_HANDLE *ctx, cvk_context_t **cvk_ctx) { + CVI_RT_HANDLE *_ctx = (CVI_RT_HANDLE *)ctx; + int ret = CVI_RT_Init(_ctx); + if (ret != CVI_SUCCESS) { + fprintf(stderr, "init failed, err %d\n", ret); + exit(-1); + } + + int alloc_size = 0x100000; + *cvk_ctx = (cvk_context_t *)CVI_RT_RegisterKernel(*_ctx, alloc_size); + printf("alloc command buffer %d bytes success\n", alloc_size); +} + +/** + * @brief de-init with runtime/kernel + * + * @param rt_ctx runtime structure + * @param cvk_ctx kernel structure + */ +static inline void test_exit(CVI_RT_HANDLE *ctx, cvk_context_t *cvk_ctx) { + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_HANDLE *_ctx = (CVI_RT_HANDLE *)ctx; + CVI_RT_DeInit(*_ctx); +} + +// converter bf16<->int8 +uint8_t convert_bf16_u8(uint16_t data); +int8_t convert_bf16_s8(uint16_t data); +uint16_t convert_int8_bf16(uint8_t data, uint8_t sign); +uint32_t convert_fp32_u32(float fp32); +float convert_hex_fp32(uint32_t hval); +uint32_t convert_fp32_hex(float val); +float convert_bf16_fp32(uint16_t bf16); +uint16_t convert_fp32_bf16(float fp32); +int set_store_feround(); +void restore_feround(int round_mode); + +static inline void *xmalloc(size_t size) { + void *p = malloc(size); + if (!p) { + return NULL; + } + return p; +} + +#ifdef __cplusplus +} +#endif + +#endif // CVIMATH_TEST_UTIL_H diff --git a/cvimath/sample/CMakeLists.txt 
b/cvimath/sample/CMakeLists.txt new file mode 100644 index 000000000..bd5d55385 --- /dev/null +++ b/cvimath/sample/CMakeLists.txt @@ -0,0 +1,28 @@ +project(cvimath_sample) + +# wrapper source + +# include header +include_directories( + ${CMAKE_SOURCE_DIR}/include + ${TPU_SDK_ROOT}/include + ${TPU_SDK_ROOT}/include/cvimath + ) + +# add libs +set( TPU_KERNEL_LIB "-L${TPU_SDK_ROOT}/lib -lcvikernel") +set( TEST_LIBS cvimath cviruntime) + +file(GLOB CVI1835_SAMPLE ./*.cpp) + +foreach(SAMPLE_SRC ${CVI1835_SAMPLE}) + get_filename_component(SAMPLE_NAME ${SAMPLE_SRC} NAME_WE) + + add_executable(${SAMPLE_NAME} ${SAMPLE_UTIL} ${SAMPLE_SRC}) + target_link_libraries(${SAMPLE_NAME} ${TPU_KERNEL_LIB} ${TEST_LIBS}) + set_target_properties(${SAMPLE_NAME} PROPERTIES COMPILE_FLAGS "-Werror -Wall -Wextra") + install(TARGETS ${SAMPLE_NAME} DESTINATION bin) + + add_test(${SAMPLE_NAME} ${SAMPLE_NAME} ctest_test) + +endforeach() diff --git a/cvimath/sample/README.md b/cvimath/sample/README.md new file mode 100644 index 000000000..8afdc32cd --- /dev/null +++ b/cvimath/sample/README.md @@ -0,0 +1,21 @@ +# CVIMath + +## How to build + +### Requirements + +1. MLIR SDK + +SOC mode + +``` +$ mkdir build +$ cd build +$ cmake -G Ninja .. 
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ + -DTOOLCHAIN_ROOT_DIR=${PWD}/../../gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu \ + -DCMAKE_TOOLCHAIN_FILE=${PWD}/../toolchain-aarch64-linux.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX= \ + -DTPU_SDK_ROOT= +$ ninja -j8 && ninja install +``` diff --git a/cvimath/sample/sample_bf16_fp32.cpp b/cvimath/sample/sample_bf16_fp32.cpp new file mode 100644 index 000000000..e4f0f9a64 --- /dev/null +++ b/cvimath/sample/sample_bf16_fp32.cpp @@ -0,0 +1,130 @@ +// \file mask sample for gt(great than), ge(great equal), eq(equal), lt(less than), le(less equal) + +// header include +#include +#include // math +#include // kerenl + +void init_input(uint16_t *input_data, uint64_t ifmap_size) { + for (uint64_t i = 0; i < ifmap_size; i++) { + input_data[i] = convert_fp32_bf16(i * 1.0); + } +} + +void init_ref(uint16_t *input_data, uint32_t *ref_data, uint64_t ifmap_size) { + union s { + uint16_t int16[2]; // big endian + uint32_t int32; + }; + union s _s; + for (uint64_t i = 0; i < ifmap_size; i++) { + _s.int16[0] = 0; + _s.int16[1] = input_data[i]; + ref_data[i] = _s.int32; + } +} + +static void testbench(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, + cvk_tg_shape_t *bf16_tg_shape) { + // for calculate size we need in host + cvk_tl_shape_t ifmap_shape = {bf16_tg_shape->n, bf16_tg_shape->c, bf16_tg_shape->h, + bf16_tg_shape->w}; + + // * 2 means fp32 takes twice size of bf16 + cvk_tl_shape_t ofmap_shape = {bf16_tg_shape->n, bf16_tg_shape->c, bf16_tg_shape->h, + bf16_tg_shape->w * 2}; + + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + + // unit size is 1 bytes, bf16 takes 2 bytes + int data_type_size = 2; + + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + + // * 2 means fp32 takes twice size of bf16 + uint64_t ofmap_bytesize = ofmap_size * data_type_size * 2; + + uint8_t *input_data = (uint8_t *)xmalloc(ifmap_bytesize); + uint8_t *ref_data = (uint8_t 
*)xmalloc(ofmap_bytesize); + + // init input / output data in ddr + init_input((uint16_t *)input_data, ifmap_size); + init_ref((uint16_t *)input_data, (uint32_t *)ref_data, ifmap_size); + + // send host memory->device memory + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tg_shape_t fp32_tg_shape; + fp32_tg_shape = {ofmap_shape.n, ofmap_shape.c, ofmap_shape.h, ofmap_shape.w}; + + cvk_tg_t *bf16_tg = test_alloc_tg_mem_comp(rt_ctx, cvk_ctx, *bf16_tg_shape, fmt); + test_put_tg_mem_comp(rt_ctx, bf16_tg, (uint8_t *)input_data); + + cvk_tg_t *fp32_tg = test_alloc_tg_mem_comp(rt_ctx, cvk_ctx, fp32_tg_shape, fmt); + + // prepare command buffer + cvm_bf16_fp32(cvk_ctx, bf16_tg, fp32_tg); + + // submit descriptor + test_submit_comp(rt_ctx, cvk_ctx); + + // get data from tl + uint8_t *ofmap_data = test_get_tg_mem_comp(rt_ctx, fp32_tg); + + // compare with reference with byte + for (uint32_t i = 0; i < ofmap_size; i++) { + if (ref_data[i] != ofmap_data[i]) { + fprintf(stderr, "comparing failed output[%u] got %u, ref %u\n", i, ofmap_data[i], + ref_data[i]); + // fail case + exit(-1); + } + } + + // free resource from tpu memory + test_free_tg_mem_comp(rt_ctx, bf16_tg); + test_free_tg_mem_comp(rt_ctx, fp32_tg); + + // free resource from host memory + free(input_data); + free(ref_data); + free(ofmap_data); +} + +int main() { + CVI_RT_HANDLE rt_ctx; + cvk_context_t *cvk_ctx; + int round_mode; + + // align kerenl rounding mode + round_mode = set_store_feround(); + + // init runtime / kerenl structure + test_init(&rt_ctx, &cvk_ctx); + + cvk_tg_shape_t bf16_tg_shape = {1, 2, 3, 4}; + { + // test 1 + printf("test bf16 <%d,%d,%d,%d> to fp32\n", bf16_tg_shape.n, bf16_tg_shape.c, bf16_tg_shape.h, + bf16_tg_shape.w); + testbench(&rt_ctx, cvk_ctx, &bf16_tg_shape); + printf("compare test bf16 to fp32 done\n"); + } + + { + // test 2 + bf16_tg_shape = {1, 20, 30, 40}; + printf("test bf16 <%d,%d,%d,%d> to fp32\n", bf16_tg_shape.n, bf16_tg_shape.c, bf16_tg_shape.h, + bf16_tg_shape.w); + 
testbench(&rt_ctx, cvk_ctx, &bf16_tg_shape); + printf("compare test bf16 to fp32 done\n"); + } + + // de-init runtime / kerenl structure + test_exit(&rt_ctx, cvk_ctx); + + // restore rounding mode + restore_feround(round_mode); + + return 0; +} diff --git a/cvimath/sample/sample_fp32_bf16.cpp b/cvimath/sample/sample_fp32_bf16.cpp new file mode 100644 index 000000000..5e335b8b1 --- /dev/null +++ b/cvimath/sample/sample_fp32_bf16.cpp @@ -0,0 +1,109 @@ +// \file mask sample for gt(great than), ge(great equal), eq(equal), lt(less than), le(less equal) + +// header include +#include +#include // math +#include // kerenl + +void init_input(uint32_t *input_data, uint64_t ifmap_size) { + for (uint64_t i = 0; i < ifmap_size; i++) { + input_data[i] = ((0x1234 + i) << 16) + 0x5678 + i; + } +} + +static void testbench(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, + cvk_tg_shape_t *fp32_tg_shape) { + // for calculate size we need in host + cvk_tl_shape_t ifmap_shape = {fp32_tg_shape->n, fp32_tg_shape->c, fp32_tg_shape->h, + fp32_tg_shape->w}; + + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + + // unit size is 1 bytes, bf16 takes 2 bytes + int data_type_size = 2; + + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + uint8_t *input_data = (uint8_t *)xmalloc(ifmap_bytesize); + uint64_t ifmap_bytesize_per_fp32 = ifmap_bytesize / 4; // 4 means float takes 4 bytes + + // init input / output data in ddr + init_input((uint32_t *)input_data, ifmap_bytesize_per_fp32); + + // send host memory->device memory + cvk_fmt_t fmt = CVK_FMT_BF16; + + cvk_tg_t *fp32_tg = test_alloc_tg_mem_comp(rt_ctx, cvk_ctx, *fp32_tg_shape, fmt); + test_put_tg_mem_comp(rt_ctx, fp32_tg, (uint8_t *)input_data); + + cvk_tg_shape_t bf16_tg_shape = *fp32_tg_shape; + cvk_tg_t *bf16_tg = test_alloc_tg_mem_comp(rt_ctx, cvk_ctx, bf16_tg_shape, fmt); + + // prepare command buffer + cvm_s2s_fp32_bf16(cvk_ctx, fp32_tg->start_address, fp32_tg->shape, bf16_tg->start_address, + bf16_tg->shape, fmt); + + // 
submit descriptor + test_submit_comp(rt_ctx, cvk_ctx); + + // get data from tl + uint8_t *ofmap_data = test_get_tg_mem_comp(rt_ctx, bf16_tg); + + // compare with reference with byte + uint16_t *ofmap_data_bf16 = (uint16_t *)ofmap_data; + uint32_t *input_data_i32 = (uint32_t *)input_data; + for (uint32_t i = 0; i < ifmap_bytesize_per_fp32; i++) { + uint16_t _input_data_i16 = (input_data_i32[i] >> 16) & 0xffff; + if (_input_data_i16 != ofmap_data_bf16[i]) { + fprintf(stderr, "comparing failed output[%u] got %u, ref %u\n", i, ofmap_data_bf16[i], + _input_data_i16); + // fail case + exit(-1); + } + } + + // free resource from tpu memory + test_free_tg_mem_comp(rt_ctx, bf16_tg); + test_free_tg_mem_comp(rt_ctx, fp32_tg); + + // free resource from host memory + free(input_data); + free(ofmap_data); +} + +int main() { + CVI_RT_HANDLE rt_ctx; + cvk_context_t *cvk_ctx; + int round_mode; + + // align kerenl rounding mode + round_mode = set_store_feround(); + + // init runtime / kerenl structure + test_init(&rt_ctx, &cvk_ctx); + + cvk_tg_shape_t fp32_tg_shape = {1, 2, 3, 4}; + { + // test 1 + printf("test fp32 <%d,%d,%d,%d> to bf16\n", fp32_tg_shape.n, fp32_tg_shape.c, fp32_tg_shape.h, + fp32_tg_shape.w); + testbench(&rt_ctx, cvk_ctx, &fp32_tg_shape); + printf("compare test bf16 to fp32 done\n"); + } + + { + // test 2 + fp32_tg_shape = {1, 20, 30, 40}; + printf("test fp32 <%d,%d,%d,%d> to bf16\n", fp32_tg_shape.n, fp32_tg_shape.c, fp32_tg_shape.h, + fp32_tg_shape.w); + testbench(&rt_ctx, cvk_ctx, &fp32_tg_shape); + printf("compare test bf16 to fp32 done\n"); + } + + // de-init runtime / kerenl structure + test_exit(&rt_ctx, cvk_ctx); + + // restore rounding mode + restore_feround(round_mode); + + return 0; +} diff --git a/cvimath/sample/sample_gemm.cpp b/cvimath/sample/sample_gemm.cpp new file mode 100644 index 000000000..56d515ccf --- /dev/null +++ b/cvimath/sample/sample_gemm.cpp @@ -0,0 +1,312 @@ +// \file sample for gemm(general matrix multiply) + +// header include 
+#include +#include // math +#include // kerenl + +#include // int gettimeofday +#include /* clock_t, clock, CLOCKS_PER_SEC */ + +typedef cvk_tiu_matrix_multiplication_param_t param_t; + +// comes from +// https://stackoverflow.com/questions/47023651/multiplying-matrices-in-one-dimensional-arrays +void multiply(uint16_t *a, int row1, int col1, uint16_t *b, int row2, int col2, uint16_t *d) { + assert(col1 == row2); + // silence error=unused-but-set-parameter warning + (void)row2; + + for (int i = 0; i < row1; i++) { + for (int j = 0; j < col2; j++) { + float sum = 0; + for (int k = 0; k < col1; k++) { + float _a = convert_bf16_fp32(a[i * col1 + k]); + float _b = convert_bf16_fp32(b[k * col2 + j]); + sum = sum + _a * _b; + } + d[i * col2 + j] = convert_fp32_bf16(sum); + } + } +} + +static void multiply_i32(uint8_t *a, int row1, int col1, uint8_t *b, int row2, int col2, + uint32_t *d, cvk_fmt_t fmt) { + assert(col1 == row2); + // silence error=unused-but-set-parameter warning + (void)row2; + + for (int i = 0; i < row1; i++) { + for (int j = 0; j < col2; j++) { + int sum = 0; + for (int k = 0; k < col1; k++) { + int _a = fmt == CVK_FMT_I8 ? (int8_t)(a[i * col1 + k]) : (a[i * col1 + k]); + int _b = fmt == CVK_FMT_I8 ? 
(int8_t)(b[k * col2 + j]) : (b[k * col2 + j]); + sum = sum + _a * _b; + } + d[i * col2 + j] = (sum); + } + } +} + +// compare with uint16_t type +int array_cmp_int16(const char *const info, const uint16_t *p_exp, const uint16_t *p_got, + int count) { + int idx; + for (idx = 0; idx < count; idx++) { + if (p_exp[idx] != p_got[idx]) { + printf("%s error at index %d exp %d(%f,0x%x) got %d(%f,0x%x)\n", info, idx, p_exp[idx], + convert_bf16_fp32(p_exp[idx]), p_exp[idx], p_got[idx], convert_bf16_fp32(p_got[idx]), + p_got[idx]); + return -1; + } + } + return 0; +} + +static int array_cmp_int32(const char *const info, const uint32_t *p_exp, const uint32_t *p_got, + int count) { + int idx; + for (idx = 0; idx < count; idx++) { + if (p_exp[idx] != p_got[idx]) { + printf("%s error at index %d exp %d got %d\n", info, idx, p_exp[idx], p_got[idx]); + return -1; + } + } + return 0; +} + +static cvk_mg_t *_test_put_matrix_g(CVI_RT_HANDLE *rt_ctx, size_t row, size_t col, + cvk_fmt_t mg_data_format, uint8_t data[]) { + cvk_mg_shape_t s; + s.row = row; + s.col = col; + return test_put_matrix_g(rt_ctx, s, mg_data_format, data); +} + +static void assign_bf16_values_to_matrix(uint16_t *matrix, size_t size) { + float t; + for (size_t i = 0; i < size; i++) { + float f; +#if 1 + // simple pattern + if (i % 2 == 0) t = i % 8; + if (i % 2 == 1) t = -1 * (i % 8); + f = t; +#else + t = i * (i % 2 ? 
-1 : 1); + f = t * 0.01 + size * 0.01; +#endif + matrix[i] = convert_fp32_bf16(f); + } +} + +static void assign_i8_values_to_matrix(uint8_t *matrix, size_t size) { + for (size_t i = 0; i < size; i++) { + matrix[i] = i + 20; + } +} + +static int test_gemm_bf16(size_t M, size_t N, size_t K) { + long elapsed; + struct timeval t0, t1; + int ret = 0; + + // alloc test data in host + uint16_t *bf16_A = new uint16_t[M * K]; + uint16_t *bf16_B = new uint16_t[N * K]; + uint16_t *bf16_R = new uint16_t[2 * M * N]; + uint16_t *int16_C_ref = new uint16_t[M * N]; + + // assign data + assign_bf16_values_to_matrix(bf16_A, M * K); + assign_bf16_values_to_matrix(bf16_B, N * K); + + gettimeofday(&t0, NULL); + + multiply(bf16_A, M, K, bf16_B, K, N, int16_C_ref); + + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + printf("CPU GEMM takes %ld us\n", elapsed); + + CVI_RT_HANDLE ctx; + cvk_context_t *bk_ctx; + + // init runtime / kerenl structure + test_init(&ctx, &bk_ctx); + + // alloc device memory and put data to device + cvk_mg_t *mg_A = _test_put_matrix_g(&ctx, M, K, CVK_FMT_BF16, (uint8_t *)bf16_A); + cvk_mg_t *mg_B = _test_put_matrix_g(&ctx, K, N, CVK_FMT_BF16, (uint8_t *)bf16_B); + cvk_mg_t *mg_R = _test_put_matrix_g(&ctx, M * 2, N, CVK_FMT_BF16, (uint8_t *)bf16_R); + + // get device address for gemm + gaddr_t gaddr_a = mg_A->start_address; + gaddr_t gaddr_b = mg_B->start_address; + gaddr_t gaddr_r = mg_R->start_address; + + // prepare gemm descriptor + size_t *slice_num = + cvm_gemm((cvk_context_t *)bk_ctx, gaddr_a, gaddr_b, gaddr_r, M, K, N, CVK_FMT_BF16); + + // submit descriptor + gettimeofday(&t0, NULL); + test_submit_comp(&ctx, bk_ctx); + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + + printf("GEMM takes %ld us\n", elapsed); + + // get result from device to host + uint16_t *bf16_ref = (uint16_t *)test_get_mg_mem_comp(&ctx, mg_R); + + // compare, exit once compare fail in 
+ int cmp_res = array_cmp_int16("gemm", int16_C_ref, bf16_ref, M * N); + if (cmp_res != 0) { + ret = -1; + printf("Comparison failed for cblas_sgemm and bmblas_gemm!\n"); + } + + // free device resource + test_free_mg_mem_comp(&ctx, mg_A); + test_free_mg_mem_comp(&ctx, mg_B); + test_free_mg_mem_comp(&ctx, mg_R); + + // de-init runtime / kerenl structure + test_exit(&ctx, bk_ctx); + + // free resource from host + delete[] bf16_A; + delete[] bf16_B; + delete[] bf16_R; + delete[] int16_C_ref; + free(bf16_ref); + free(slice_num); + + return ret; +} + +static int test_gemm_i8(size_t M, size_t N, size_t K, cvk_fmt_t fmt) { + long elapsed; + struct timeval t0, t1; + int ret = 0; + + // 4 means 32bit takes 4 times size of uint8_t + int uint32_per_uint8 = sizeof(uint32_t) / sizeof(uint8_t); + + // alloc test data in host + uint8_t *i8_A = new uint8_t[M * K]; + uint8_t *i8_B = new uint8_t[N * K]; + uint8_t *i8_R = new uint8_t[uint32_per_uint8 * M * N]; + uint32_t *int32_C_ref = new uint32_t[M * N]; + + // assign data + assign_i8_values_to_matrix(i8_A, M * K); + assign_i8_values_to_matrix(i8_B, N * K); + + // measure cpu time + gettimeofday(&t0, NULL); + + multiply_i32(i8_A, M, K, i8_B, K, N, int32_C_ref, fmt); + + gettimeofday(&t1, NULL); + + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + printf("CPU GEMM takes %ld us\n", elapsed); + + // alloc runtime + CVI_RT_HANDLE ctx; + cvk_context_t *bk_ctx; + + // init runtime / kerenl structure + test_init(&ctx, &bk_ctx); + + // alloc device memory and put data to device + cvk_mg_t *mg_A = _test_put_matrix_g(&ctx, M, K, CVK_FMT_I8, (uint8_t *)i8_A); + cvk_mg_t *mg_B = _test_put_matrix_g(&ctx, K, N, CVK_FMT_I8, (uint8_t *)i8_B); + cvk_mg_t *mg_R = _test_put_matrix_g(&ctx, M * uint32_per_uint8, N, CVK_FMT_I8, (uint8_t *)i8_R); + + // get device address for gemm + gaddr_t gaddr_a = mg_A->start_address; + gaddr_t gaddr_b = mg_B->start_address; + gaddr_t gaddr_r = mg_R->start_address; + + // prepare gemm 
descriptor + size_t *slice_num = cvm_gemm((cvk_context_t *)bk_ctx, gaddr_a, gaddr_b, gaddr_r, M, K, N, fmt); + + gettimeofday(&t0, NULL); + + // submit descriptor + test_submit_comp(&ctx, bk_ctx); + + gettimeofday(&t1, NULL); + + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + printf("GEMM takes %ld us\n", elapsed); + + // get result from device to host + uint8_t *i8_R_host = (uint8_t *)test_get_mg_mem_comp(&ctx, mg_R); + + // for re-combine + uint32_t *i32_C = new uint32_t[M * N]; + + if (fmt == CVK_FMT_I8) { + cvm_combin_gemm_i8(slice_num, i8_R_host, i32_C, M, N); + } + + free(slice_num); + + // compare, exit once compare fail in + int cmp_res = array_cmp_int32("gemm", int32_C_ref, i32_C, M * N); + if (cmp_res != 0) { + ret = -1; + printf("Comparison failed for cblas_sgemm and bmblas_gemm!\n"); + } + + // free device resource + test_free_mg_mem_comp(&ctx, mg_A); + test_free_mg_mem_comp(&ctx, mg_B); + test_free_mg_mem_comp(&ctx, mg_R); + + // de-init runtime / kerenl structure + test_exit(&ctx, bk_ctx); + + // free resource from host + delete[] i8_A; + delete[] i8_B; + delete[] i8_R; + delete[] int32_C_ref; + delete[] i32_C; + free(i8_R_host); + + return ret; +} + +static int test_gemm(size_t M, size_t N, size_t K, cvk_fmt_t fmt) { + printf("%s: M=%zu, N=%zu, K=%zu\n", __func__, M, N, K); + if (fmt == CVK_FMT_BF16) { + return test_gemm_bf16(M, N, K); + } else { + return test_gemm_i8(M, N, K, fmt); + } +} + +int main() { + int round_mode; + // align backend rounding + round_mode = set_store_feround(); + + if (0 != test_gemm(3, 500, 512, CVK_FMT_BF16)) exit(-1); + if (0 != test_gemm(1, 20000, 512, CVK_FMT_I8)) exit(-1); + + // heavy test + // if (0 != test_gemm(300, 500, 512, CVK_FMT_BF16)) exit(-1); + + printf("Comparison done for cpu gemm and tpu gemm!\n\n"); + + // restore rounding + restore_feround(round_mode); + + return 0; +} diff --git a/cvimath/sample/sample_mask.cpp b/cvimath/sample/sample_mask.cpp new file mode 100644 index 
000000000..fdf90e19d --- /dev/null +++ b/cvimath/sample/sample_mask.cpp @@ -0,0 +1,175 @@ +// \file mask sample for gt(great than), ge(great equal), eq(equal), lt(less than), le(less equal) + +// header include +#include +#include // math +#include // kerenl + +// global variable for loop all test case +static enum CVM_MASK_TYPE mode; + +// global structure for test +struct pattern { + float *input; // input + float *ref; // reference output + int len; // data lenth +#define HELP_LEN (10) + char help[HELP_LEN]; // help message +}; + +// input +float cvm_mask_type_gt_0_input[] = {-1 * pow(2, -62), -0.003, -1.0, -100000, 0.000001, 1, 1000, + pow(2, 62), 0}; + +// ref, 0 means false, 1 means true +float cvm_mask_type_gt_0_output[] = {0, 0, 0, 0, 1, 1, 1, 1, 0}; +float cvm_mask_type_ge_0_output[] = {0, 0, 0, 0, 1, 1, 1, 1, 1}; +float cvm_mask_type_eq_0_output[] = {0, 0, 0, 0, 0, 0, 0, 0, 1}; +float cvm_mask_type_lt_0_output[] = {1, 1, 1, 1, 0, 0, 0, 0, 0}; +float cvm_mask_type_le_0_output[] = {1, 1, 1, 1, 0, 0, 0, 0, 1}; + +// size of input +int input_sz = sizeof(cvm_mask_type_gt_0_input) / sizeof(cvm_mask_type_gt_0_input[0]); + +// init test case +static struct pattern patterns[] = { + {cvm_mask_type_gt_0_input, cvm_mask_type_gt_0_output, input_sz, "gt test"}, + {cvm_mask_type_gt_0_input, cvm_mask_type_ge_0_output, input_sz, "ge test"}, + {cvm_mask_type_gt_0_input, cvm_mask_type_eq_0_output, input_sz, "eq test"}, + {cvm_mask_type_gt_0_input, cvm_mask_type_lt_0_output, input_sz, "lt test"}, + {cvm_mask_type_gt_0_input, cvm_mask_type_le_0_output, input_sz, "le test"}, +}; + +static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) { + // default test bf16 case + cvk_fmt_t fmt = CVK_FMT_BF16; + + struct pattern *p = &patterns[mode]; + + // alloc shape, align with \len + uint32_t input_n = 1; + uint32_t input_c = 1; + uint32_t input_h = 1; + uint32_t input_w = p->len; + + cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + cvk_tl_shape_t ofmap_shape 
= ifmap_shape; + + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + + // unit size is 1 bytes, bf16 takes 2 bytes + int data_type_size = 1; + if (fmt == CVK_FMT_BF16) { + data_type_size = 2; + } + + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + + // get table shape + cvk_tl_shape_t table_shape; + uint64_t table_bytesize = cvm_lut_tbl_bytesize(bmk, &table_shape, fmt); + + // alloc input/output tl + cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, CTRL_AL); + cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, CTRL_AL); + + // alloc lookup table + cvk_tl_t *tl_pos_neg_buf = test_alloc_tl(bmk, table_shape, fmt, CTRL_AL); + cvk_tl_t *tl_0_idx_table = test_alloc_tl(bmk, table_shape, fmt, CTRL_AL); + + // alloc tmp tl + cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, CTRL_AL); + cvk_tl_t *tl_buf2 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, CTRL_AL); + cvk_tl_t *tl_buf4 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, CTRL_AL); + + // alloc data from ddr + uint16_t *input_data = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize); + uint16_t *table_data_atan_pos_neg = (uint16_t *)xmalloc(table_bytesize); + uint16_t *idx_0_table_data = (uint16_t *)xmalloc(table_bytesize); + + // init lookup table data in ddr + cvm_gen_0_tbl(idx_0_table_data, &table_shape); + cvm_pos_neg_tbl(table_data_atan_pos_neg, &table_shape); + + // init input / output data in ddr + for (uint32_t i = 0; i < ifmap_size; i++) { + input_data[i] = convert_fp32_bf16(p->input[i]); + ref_data[i] = convert_fp32_bf16(p->ref[i]); + } + + // send ddr data to tl + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)input_data); + test_put_tensor_g2l_comp(ctx, bmk, tl_pos_neg_buf, (uint8_t *)table_data_atan_pos_neg); + test_put_tensor_g2l_comp(ctx, bmk, tl_0_idx_table, (uint8_t *)idx_0_table_data); + + // emit mask 
function + cvm_emit_mask(bmk, + tl_ifmap, // input + tl_buf, tl_buf2, tl_buf4, // tmp buffer + tl_pos_neg_buf, tl_0_idx_table, // lookup table + tl_ofmap_bf16, // output + fmt, mode); + + // submit descriptor + test_submit_comp(ctx, bmk); + + // get data from tl + uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, tl_ofmap_bf16); + + // compare with reference + for (uint32_t i = 0; i < ifmap_size; i++) { + if (ref_data[i] != ofmap_data[i]) { + fprintf(stderr, "comparing failed at mode (%s) output[%u] got %f(0x%x), ref %f(0x%x)\n", + p->help, i, convert_bf16_fp32(ofmap_data[i]), ofmap_data[i], + convert_bf16_fp32(ref_data[i]), ref_data[i]); + // fail case + exit(-1); + } + } + + // free resource from kernel + free_tl(bmk, tl_buf4); + free_tl(bmk, tl_buf2); + free_tl(bmk, tl_buf); + free_tl(bmk, tl_0_idx_table); + free_tl(bmk, tl_pos_neg_buf); + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_ifmap); + + // free resource from heap + free(input_data); + free(ref_data); + free(ofmap_data); + free(table_data_atan_pos_neg); + free(idx_0_table_data); +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bmk; + int round_mode; + + // align kerenl rounding mode + round_mode = set_store_feround(); + + // init runtime / kerenl structure + test_init(&ctx, &bmk); + + for (int i = CVM_MASK_TYPE_GT_0; i < CVM_MASK_MAX; i++) { + mode = static_cast(i); + struct pattern *p = &patterns[mode]; + printf("test %s...\n", p->help); + testbench(&ctx, bmk); + } + + // de-init runtime / kerenl structure + test_exit(&ctx, bmk); + + // restore rounding mode + restore_feround(round_mode); + + return 0; +} diff --git a/cvimath/sample/sample_reduce_mul.cpp b/cvimath/sample/sample_reduce_mul.cpp new file mode 100644 index 000000000..5a31d14cc --- /dev/null +++ b/cvimath/sample/sample_reduce_mul.cpp @@ -0,0 +1,160 @@ +// \file mask sample for gt(great than), ge(great equal), eq(equal), lt(less than), le(less equal) + +// header include +#include +#include // math +#include // 
kerenl + +void init_input(uint8_t *input_data, uint64_t ifmap_bytesize, cvk_fmt_t fmt) { + uint32_t fmt_size = cvm_bytesize_of_fmt(fmt); + uint64_t sz = ifmap_bytesize / fmt_size; + int round = 4; // random + for (uint64_t i = 0; i < sz; i++) { + uint8_t r[2]; + r[0] = i % round; + if (r[0] == 0) { + r[0] = 1; // prevent mul to 0 + } + + if (fmt_size == 2) { + // bf16 + uint16_t bf16 = convert_fp32_bf16((float)r[0]); + memcpy(r, &bf16, fmt_size); + } + memcpy(&input_data[i * fmt_size], r, fmt_size); + } +} + +void init_ref(uint8_t *input_data, uint8_t *ref_data, cvk_tl_shape_t *ifmap_shape, cvk_fmt_t fmt) { + uint32_t fmt_size = cvm_bytesize_of_fmt(fmt); + int ref_idx = 0; + + // reduce ONLY hw + for (uint32_t n = 0; n < ifmap_shape->n; n++) { + for (uint32_t c = 0; c < ifmap_shape->c; c++) { + float tmp = 1; + for (uint32_t h = 0; h < ifmap_shape->h; h++) { + for (uint32_t w = 0; w < ifmap_shape->w; w++) { + uint32_t off = (n * ifmap_shape->c * ifmap_shape->h * ifmap_shape->w + + c * ifmap_shape->h * ifmap_shape->w + h * ifmap_shape->w + w) * + fmt_size; + float v; + if (fmt_size == 2) { + // bf16 case + uint16_t bf16; + memcpy(&bf16, &input_data[off], fmt_size); + v = convert_bf16_fp32(bf16); + } else { + v = input_data[off]; + } + tmp = v * tmp; + } + } + uint8_t r[2]; + if (fmt_size == 2) { + // bf16 case + uint16_t bf16 = convert_fp32_bf16(tmp); + memcpy(r, (void *)&bf16, fmt_size); + } else { + r[0] = tmp; + } + memcpy(&ref_data[ref_idx * fmt_size], r, fmt_size); + ref_idx++; + } + } +} + +static void testbench(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, cvk_fmt_t fmt) { + // alloc shape, align with \len + uint32_t input_n = 1; + uint32_t input_c = 3; + uint32_t input_h = 2; + uint32_t input_w = 2; + + cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + // NOTICE: ONLY reduce hw for performance + cvk_tl_shape_t ofmap_shape = {input_n, input_c, 1, 1}; + + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = 
tl_shape_size(&ofmap_shape); + + // unit size is 1 bytes, bf16 takes 2 bytes + int data_type_size = 1; + if (fmt == CVK_FMT_BF16) { + data_type_size = 2; + } + + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + + // alloc input/output tl + cvk_tl_t *tl_ifmap = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, CTRL_AL); + + // alloc data from ddr + uint8_t *input_data = (uint8_t *)xmalloc(ifmap_bytesize); + uint8_t *ref_data = (uint8_t *)xmalloc(ofmap_bytesize); + + // init input / output data in ddr + init_input(input_data, ifmap_bytesize, fmt); + init_ref(input_data, ref_data, &ifmap_shape, fmt); + + // send host memory->device memory->tpu_memory + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_ifmap, (uint8_t *)input_data); + + // prepare command buffer + cvm_reduce_hw_mul(cvk_ctx, tl_ifmap); + + // submit descriptor + test_submit_comp(rt_ctx, cvk_ctx); + + // reshape for reduce result + tl_ifmap->shape = {tl_ifmap->shape.n, tl_ifmap->shape.c, 1, 1}; + tl_ifmap->stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_ifmap->shape, tl_ifmap->fmt, 1); + + // get data from tl + uint8_t *ofmap_data = test_get_tensor_l2g_comp(rt_ctx, cvk_ctx, tl_ifmap); + + // compare with reference with byte + for (uint32_t i = 0; i < ofmap_size; i++) { + if (ref_data[i] != ofmap_data[i]) { + fprintf(stderr, "comparing failed output[%u] got %u, ref %u\n", i, ofmap_data[i], + ref_data[i]); + // fail case + exit(-1); + } + } + + // free resource from tpu memory + free_tl(cvk_ctx, tl_ifmap); + + // free resource from host memory + free(input_data); + free(ref_data); + free(ofmap_data); +} + +int main() { + CVI_RT_HANDLE rt_ctx; + cvk_context_t *cvk_ctx; + int round_mode; + + // align kerenl rounding mode + round_mode = set_store_feround(); + + // init runtime / kerenl structure + test_init(&rt_ctx, &cvk_ctx); + + printf("test reduce mul int8\n"); + testbench(&rt_ctx, cvk_ctx, CVK_FMT_I8); + + printf("test reduce mul bf16\n"); + 
testbench(&rt_ctx, cvk_ctx, CVK_FMT_BF16); + + // de-init runtime / kerenl structure + test_exit(&rt_ctx, cvk_ctx); + + // restore rounding mode + restore_feround(round_mode); + + return 0; +} diff --git a/cvimath/sample/sample_set_val_by_mask.cpp b/cvimath/sample/sample_set_val_by_mask.cpp new file mode 100644 index 000000000..2d5d1b94c --- /dev/null +++ b/cvimath/sample/sample_set_val_by_mask.cpp @@ -0,0 +1,656 @@ +// \file sample for set value by mask, plz refer \cvimath_internal.h for more details + +// header include +#include +#include // math +#include // kerenl + +#include // int gettimeofday +#include /* clock_t, clock, CLOCKS_PER_SEC */ + +#define DEBUG 1 // < 0 is disable debug +#define debug_print(fmt, ...) \ + do { \ + if (DEBUG) fprintf(stderr, fmt, __VA_ARGS__); \ + } while (0) + +int flip = 0; +struct testbench { + char *name; + int (*cvm_run)(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2, cvk_tl_t *tl_buf, + cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl, cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, + uint8_t threshold, uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap); + void (*ref)(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask, uint8_t *pNewY, uint8_t *pY, + uint8_t *g_update_tbl, uint8_t threshold, uint8_t w1, uint8_t w2); + uint8_t threshold; + uint8_t w1; + uint8_t w2; +}; + +static void init_kernel(uint8_t *kernel_data, uint64_t kernel_size, int8_t val) { + int8_t *kernel_data_i8 = (int8_t *)kernel_data; + for (uint64_t i = 0; i < kernel_size; i++) { + kernel_data_i8[i] = val; + } +} + +static void init_bias(uint8_t *bias_data, uint64_t bias_size, int16_t val) { + int c = bias_size / 2; + + for (int i = 0; i < c; i++) { + bias_data[i] = val & 0xff; + bias_data[i + c] = (val >> 8) & 0xff; + } +} + +static void init_input_2(uint8_t *input_data, uint64_t ifmap_size) { + for (uint64_t i = 0; i < ifmap_size; i++) { + input_data[i] = i * 2 * (i % 3 ? 
-1 : 1); + } +} + +static void init_input_3(uint8_t *input_data, uint64_t ifmap_size) { + for (uint64_t i = 0; i < ifmap_size; i++) { + input_data[i] = i * 3; + } +} + +static void init_mask(uint8_t *mask, uint64_t ifmap_size) { + for (uint64_t i = 0; i < ifmap_size; i++) { + mask[i] = i % 2; + } +} + +static void init_update_tbl(uint8_t *update_tbl, uint64_t ifmap_size) { + int8_t *update_tbl_i8 = (int8_t *)update_tbl; + for (uint64_t i = 0; i < ifmap_size; i++) { + update_tbl_i8[i] = i * (i % 2 ? -1 : 1); + } +} + +static void init_ref(uint8_t *ref_data, uint64_t ofmap_size) { + for (uint64_t i = 0; i < ofmap_size; i++) { + ref_data[i] = -1 * i; + // ref_data[i] = 3 * i; + } +} + +static void set_image_by_u8mask(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask, + uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl, + uint8_t threshold, uint8_t w1, uint8_t w2) { + (void)pY; + (void)g_update_tbl; + (void)threshold; + (void)w1; + (void)w2; + + for (size_t i = 0; i < ifmap_size; i++) { + if (mask[i]) { + ref_data[i] = pNewY[i]; + } + } +} + +static void set_image_by_two_info_i8(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask, + uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl, + uint8_t threshold, uint8_t w1, uint8_t w2) { + (void)pY; + (void)w1; + (void)w2; + int8_t *g_update_tbl_i8 = (int8_t *)g_update_tbl; + + for (size_t i = 0; i < ifmap_size; i++) { + if (mask[i] && (g_update_tbl_i8[i] < threshold)) { + ref_data[i] = pNewY[i]; + } + } +} + +static void gen_image_diff(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask, uint8_t *pNewY, + uint8_t *pY, uint8_t *g_update_tbl, uint8_t threshold, uint8_t w1, + uint8_t w2) { + (void)mask; + (void)w1; + (void)w2; + (void)g_update_tbl; + (void)threshold; + + for (size_t i = 0; i < ifmap_size; i++) { + ref_data[i] = abs(pNewY[i] - pY[i]); + } +} + +static void update_tbl_by_threshold(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask, + uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl, + uint8_t 
threshold, uint8_t w1, uint8_t w2) { + (void)pNewY; + (void)pY; + (void)g_update_tbl; + (void)mask; + (void)w2; + int8_t *ref_data_i8 = (int8_t *)ref_data; // output is i8 + + for (size_t i = 0; i < ifmap_size; i++) { + mask[i] = 0; + } + + for (size_t i = 0; i < ifmap_size; i++) { + int8_t old = ref_data_i8[i]; + if (g_update_tbl[i] < threshold) { + ref_data_i8[i] = (ref_data_i8[i] < w1) ? 0 : (ref_data_i8[i] - 1); + } else { + if (old != 127) { + // saturate it + ref_data_i8[i]++; + } + mask[i] = 1; + } + } +} + +static void set_image_by_two_info_u8(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask, + uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl, + uint8_t threshold, uint8_t w1, uint8_t w2) { + (void)pY; + (void)mask; + (void)w1; + (void)w2; + // int8_t* g_update_tbl_i8 = (int8_t*)g_update_tbl; + + for (size_t i = 0; i < ifmap_size; i++) { + if (g_update_tbl[i] >= threshold) { + ref_data[i] = pNewY[i]; + } + } +} + +static void blend_image_by_tbl(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask, + uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl, + uint8_t threshold, uint8_t w1, uint8_t w2) { + (void)mask; + (void)pY; + int8_t *g_update_tbl_i8 = (int8_t *)g_update_tbl; + for (size_t i = 0; i < ifmap_size; i++) { + if (g_update_tbl_i8[i] > threshold) { + ref_data[i] = (w1 * ref_data[i] + w2 * pNewY[i]) >> 8; + } + } +} + +static int _cvm_set_image_by_u8mask(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2, + cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl, + cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, uint8_t threshold, + uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap) { + (void)tl_ifmap2; + (void)tl_update_tbl; + (void)threshold; + (void)w1; + (void)w2; + (void)tl_kernel; + (void)tl_bias; + (void)tl_buf; + + return cvm_set_image_by_u8mask(ctx, tl_ifmap, tl_buf, tl_mask, tl_ofmap); +} + +static int _cvm_set_image_by_u8mask_dp(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2, + cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, 
cvk_tl_t *tl_update_tbl, + cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, uint8_t threshold, + uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap) { + (void)tl_ifmap2; + (void)tl_update_tbl; + (void)threshold; + (void)w1; + (void)w2; + (void)tl_kernel; + (void)tl_bias; + (void)tl_buf; + + return cvm_set_image_by_u8mask_dp(ctx, tl_ifmap, tl_mask, tl_kernel, tl_bias, tl_ofmap); +} + +static int _cvm_set_image_by_two_info_i8(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, + cvk_tl_t *tl_ifmap2, cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, + cvk_tl_t *tl_update_tbl, cvk_tl_t *tl_kernel, + cvk_tl_t *tl_bias, uint8_t threshold, uint8_t w1, + uint8_t w2, cvk_tl_t *tl_ofmap) { + (void)tl_ifmap2; + (void)threshold; + (void)w1; + (void)w2; + (void)tl_kernel; + (void)tl_bias; + + // tl_ifmap2 as buf + return cvm_set_image_by_two_info_i8(ctx, tl_ifmap, tl_buf, tl_mask, tl_update_tbl, threshold, + tl_ofmap); +} + +static int _cvm_set_image_by_two_info_i8_dp(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, + cvk_tl_t *tl_ifmap2, cvk_tl_t *tl_buf, + cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl, + cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, + uint8_t threshold, uint8_t w1, uint8_t w2, + cvk_tl_t *tl_ofmap) { + (void)tl_ifmap2; + (void)threshold; + (void)w1; + (void)w2; + (void)tl_kernel; + (void)threshold; + (void)tl_buf; + + return cvm_set_image_by_two_info_i8_dp(ctx, tl_ifmap, tl_kernel, tl_mask, tl_update_tbl, tl_bias, + tl_ofmap); +} + +static int _cvm_gen_image_diff(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2, + cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl, + cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, uint8_t threshold, + uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap) { + (void)tl_mask; + (void)tl_buf; + (void)tl_update_tbl; + (void)threshold; + (void)w1; + (void)w2; + (void)tl_kernel; + (void)tl_bias; + + // tl_mask as buffer + return cvm_gen_image_diff(ctx, tl_ifmap, tl_ifmap2, tl_mask, tl_buf, tl_ofmap); +} + +static int _cvm_update_tbl_by_threshold(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, 
cvk_tl_t *tl_ifmap2, + cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, + cvk_tl_t *tl_update_tbl, cvk_tl_t *tl_kernel, + cvk_tl_t *tl_bias, uint8_t threshold, uint8_t w1, + uint8_t w2, cvk_tl_t *tl_ofmap) { + (void)w2; + (void)tl_kernel; + (void)tl_bias; + + // w1 as threshold_b, tl_ifmap/tl_ifmap2 as buf + return cvm_update_tbl_by_threshold(ctx, tl_mask, tl_ifmap, tl_ifmap2, tl_buf, tl_update_tbl, + threshold, w1, tl_ofmap); +} + +static int _cvm_set_image_by_two_info_u8(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, + cvk_tl_t *tl_ifmap2, cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, + cvk_tl_t *tl_update_tbl, cvk_tl_t *tl_kernel, + cvk_tl_t *tl_bias, uint8_t threshold, uint8_t w1, + uint8_t w2, cvk_tl_t *tl_ofmap) { + (void)tl_ifmap2; + (void)tl_mask; + (void)w1; + (void)w2; + (void)tl_kernel; + (void)tl_bias; + + // tl_ifmap2 as buf + return cvm_set_image_by_two_info_u8(ctx, tl_ifmap, tl_ifmap2, tl_buf, tl_update_tbl, threshold, + tl_ofmap); +} + +static int _cvm_blend_image_by_tbl(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2, + cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl, + cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, uint8_t threshold, + uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap) { + (void)tl_ifmap2; + (void)tl_kernel; + (void)tl_bias; + // tl_mask as buf + return cvm_blend_image_by_tbl(ctx, tl_ifmap, tl_mask, tl_buf, tl_update_tbl, threshold, w1, w2, + tl_ofmap); +} + +struct testbench testbenchs[] = { + {(char *)"set_image_by_two_info_i8_dp", _cvm_set_image_by_two_info_i8_dp, + set_image_by_two_info_i8, 2, 2, 3}, + {(char *)"set_image_by_u8mask_dp", _cvm_set_image_by_u8mask_dp, set_image_by_u8mask, 10, 2, 3}, + + {(char *)"set_image_by_u8mask", _cvm_set_image_by_u8mask, set_image_by_u8mask, 10, 2, 3}, + {(char *)"set_image_by_two_info_i8", _cvm_set_image_by_two_info_i8, set_image_by_two_info_i8, 2, + 2, 3}, + {(char *)"update_tbl_by_threshold", _cvm_update_tbl_by_threshold, update_tbl_by_threshold, 15, + 12, 3}, + {(char *)"gen_image_diff", 
_cvm_gen_image_diff, gen_image_diff, 10, 2, 3}, + {(char *)"set_image_by_two_info_u8", _cvm_set_image_by_two_info_u8, set_image_by_two_info_u8, + 40, 2, 3}, + {(char *)"blend_image_by_tbl", _cvm_blend_image_by_tbl, blend_image_by_tbl, 6, 2, 3}, +}; + +static void load(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap2, + uint8_t *input_ifmap2, cvk_tl_t *tl_ifmap3, uint8_t *input_ifmap3, + cvk_tl_t *tl_ofmap, uint8_t *input_ofmap, cvk_tl_t *tl_mask, uint8_t *input_mask, + cvk_tl_t *tl_update_tbl, uint8_t *input_update_tbl) { + // send device memory to sram + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_ifmap2, input_ifmap2); + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_ifmap3, input_ifmap3); + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_mask, input_mask); + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_update_tbl, input_update_tbl); + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_ofmap, input_ofmap); +} + +static void store(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, char *name, cvk_tl_t *tl_ofmap, + uint8_t *output_ofmap, cvk_tl_t *tl_mask, uint8_t *output_mask, int sz) { + uint8_t *ofmap_data = (uint8_t *)test_get_tensor_l2g_comp(rt_ctx, cvk_ctx, tl_ofmap); + + // NOTICE: heavy copy + memcpy(output_ofmap, ofmap_data, sz); + + free(ofmap_data); + + if (!strcmp(name, "update_tbl_by_threshold")) { + uint8_t *mask_data = (uint8_t *)test_get_tensor_l2g_comp(rt_ctx, cvk_ctx, tl_mask); + memcpy(output_mask, mask_data, sz); + free(mask_data); + } +} + +static void testbench(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, cvk_tg_shape_t *tg_shape, + int testcase_idx, int is_pingpong = false) { + // for calculate size we need in host + cvk_tl_shape_t ifmap_shape = {tg_shape->n, tg_shape->c, tg_shape->h, tg_shape->w}; + cvk_tl_shape_t ofmap_shape = ifmap_shape; + + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + + // unit size is 1 bytes + int data_type_size = 1; + + // get input/output size + 
uint64_t ifmap_bytesize = ifmap_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + + // alloc on ddr + // uint8_t *input_data = (uint8_t *)xmalloc(ifmap_bytesize); + uint8_t *input_data2 = (uint8_t *)xmalloc(ifmap_bytesize); + uint8_t *input_data3 = (uint8_t *)xmalloc(ifmap_bytesize); + uint8_t *mask = (uint8_t *)xmalloc(ifmap_bytesize); + uint8_t *update_tbl = (uint8_t *)xmalloc(ifmap_bytesize); + uint8_t *_ref_data = (uint8_t *)xmalloc(ofmap_bytesize); + uint8_t *ref_data = (uint8_t *)xmalloc(ofmap_bytesize); + uint8_t *tpu_output_data = (uint8_t *)xmalloc(ofmap_bytesize); + uint8_t *tpu_output_mask = (uint8_t *)xmalloc(ofmap_bytesize); + + // init input / output data in ddr + uint8_t threshold, w1, w2; + threshold = testbenchs[testcase_idx].threshold; + w1 = testbenchs[testcase_idx].w1; + w2 = testbenchs[testcase_idx].w2; + init_input_2(input_data2, ifmap_size); + init_input_3(input_data3, ifmap_size); + // init_input(input_data2, ifmap_size); + // init_input(input_data3, ifmap_size); + init_mask(mask, ifmap_size); + init_update_tbl(update_tbl, ifmap_size); + init_ref(ref_data, ofmap_size); + + // keep org output + memcpy(_ref_data, ref_data, ofmap_bytesize); + + testbenchs[testcase_idx].ref(ref_data, ofmap_size, mask, input_data2, input_data3, update_tbl, + threshold, w1, w2); + + int tiles = std::ceil(ifmap_shape.c / (float)cvk_ctx->info.npu_num); + + ifmap_shape.c = ifmap_shape.c / tiles; + + cvk_tl_shape_t kernel_shape = ifmap_shape; + kernel_shape.h = 1; + kernel_shape.w = 1; + + cvk_tl_shape_t bias_shape = ifmap_shape; + bias_shape.h = 1; + bias_shape.w = 1; + bias_shape.n = 2; + + uint64_t kernel_size = tl_shape_size(&kernel_shape); + uint64_t bias_size = tl_shape_size(&bias_shape); + uint64_t kernel_bytesize = kernel_size * data_type_size; + uint64_t bias_bytesize = bias_size * data_type_size; + uint8_t *kernel_data = (uint8_t *)xmalloc(kernel_bytesize); + uint8_t *bias_data = (uint8_t *)xmalloc(bias_bytesize); + + // 
NOTICE: must init with it + init_kernel(kernel_data, kernel_size, -1); + init_bias(bias_data, bias_size, 1); + + if (!strcmp(testbenchs[testcase_idx].name, "set_image_by_two_info_i8_dp")) { + init_kernel(kernel_data, kernel_size, 1); + init_bias(bias_data, bias_size, -1 * threshold); + } + + if (is_pingpong) { + // quirk that we tile h for easy implemenetation + ifmap_shape.h /= 2; + tiles *= 2; + } + + // sync input/output + ofmap_shape = ifmap_shape; + + // NOTICE: dont care batch + int shape_sz = ifmap_shape.c * ifmap_shape.h * ifmap_shape.w; + + // alloc on sram, just once + cvk_fmt_t fmt = CVK_FMT_U8; // for mac used + int eu_align = 1; // dont care + cvk_tl_t *tl_ifmap2[2] = {NULL, NULL}; + cvk_tl_t *tl_ifmap3[2] = {NULL, NULL}; + cvk_tl_t *tl_ofmap[2] = {NULL, NULL}; + cvk_tl_t *tl_mask[2] = {NULL, NULL}; + cvk_tl_t *tl_update_tbl[2] = {NULL, NULL}; + // must place last for high part of 'mac' + cvk_tl_t *tl_buf[2] = {NULL, NULL}; + cvk_tl_t *tl_kernel, *tl_bias; + + // alloc sram + tl_kernel = test_alloc_tl(cvk_ctx, kernel_shape, CVK_FMT_I8, eu_align); + tl_bias = test_alloc_tl(cvk_ctx, bias_shape, CVK_FMT_I8, /*eu_align=*/0); + + int alloc_nr = is_pingpong ? 
2 : 1; + for (int i = 0; i < alloc_nr; i++) { + tl_ifmap2[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align); + tl_ifmap3[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align); + tl_ofmap[i] = test_alloc_tl(cvk_ctx, ofmap_shape, fmt, eu_align); + tl_mask[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align); + tl_update_tbl[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align); + // must place last for high part of 'mac' + tl_buf[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align); + } + + // NOTICE: consider residual + int load_offset = 0; + int store_offset = 0; + int ret; + int curr = flip; + long elapsed; + struct timeval t0, t1; + gettimeofday(&t0, NULL); + + if (!is_pingpong) { + int off = 0; + for (int i = 0; i < tiles; i++) { + // NOTICE: load each loop + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_kernel, kernel_data); + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_bias, bias_data); + + load(rt_ctx, cvk_ctx, tl_ifmap2[curr], input_data2 + off, tl_ifmap3[curr], input_data3 + off, + tl_ofmap[curr], _ref_data + off, tl_mask[curr], mask + off, tl_update_tbl[curr], + update_tbl + off); + + int ret = testbenchs[testcase_idx].cvm_run( + cvk_ctx, tl_ifmap2[curr], tl_ifmap3[curr], tl_buf[curr], tl_mask[curr], + tl_update_tbl[curr], tl_kernel, tl_bias, threshold, w1, w2, tl_ofmap[curr]); + + if (ret) { + fflush(stderr); + printf("%s", "generate commands fail, return\n"); + exit(-1); + } + + store(rt_ctx, cvk_ctx, testbenchs[testcase_idx].name, tl_ofmap[curr], tpu_output_data + off, + tl_mask[curr], tpu_output_mask + off, shape_sz); + + off += shape_sz; + } + } else { + // TODO: not load at once + int operand_num = 1; + int input_flip = 0; + int output_flip = 0; + for (int i = 0; i < tiles + 2; i++) { + cvk_ctx->ops->parallel_enable(cvk_ctx); + // NOTICE: load each loop + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_kernel, kernel_data); + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_bias, bias_data); + + // send device memory to sram + if ((i - 2) >= 
0 && (i - 2) % operand_num == operand_num - 1) { + int curr = 1 - output_flip; + store(rt_ctx, cvk_ctx, testbenchs[testcase_idx].name, tl_ofmap[curr], + tpu_output_data + store_offset, tl_mask[curr], tpu_output_mask + store_offset, + shape_sz); + store_offset += shape_sz; + } + + if (i - 1 >= 0 && i - 1 < tiles) { + // get data from tl + int curr = 1 - input_flip; + // prepare command buffer + ret = testbenchs[testcase_idx].cvm_run( + cvk_ctx, tl_ifmap2[curr], tl_ifmap3[curr], tl_buf[curr], tl_mask[curr], + tl_update_tbl[curr], tl_kernel, tl_bias, threshold, w1, w2, tl_ofmap[curr]); + + if (ret) { + fflush(stderr); + printf("%s", "generate commands fail, return\n"); + exit(-1); + } + output_flip = 1 - output_flip; + } + + if (i < tiles) { + load(rt_ctx, cvk_ctx, tl_ifmap2[input_flip], input_data2 + load_offset, + tl_ifmap3[input_flip], input_data3 + load_offset, tl_ofmap[input_flip], + _ref_data + load_offset, tl_mask[input_flip], mask + load_offset, + tl_update_tbl[input_flip], update_tbl + load_offset); + load_offset += shape_sz; + input_flip = 1 - input_flip; + } + cvk_ctx->ops->parallel_disable(cvk_ctx); + } + } + + // submit descriptor + test_submit_comp(rt_ctx, cvk_ctx); + + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + + // compare with reference with byte + debug_print("%s comparing...", testbenchs[testcase_idx].name); + for (uint32_t i = 0; i < (uint32_t)ofmap_bytesize; i++) { + if (ref_data[i] != tpu_output_data[i]) { + debug_print("comparing failed output[%u] got %u, ref %u\n", i, tpu_output_data[i], + ref_data[i]); + // fail case + fflush(stderr); + exit(-1); + } + } + + // compare another export information + if (!strcmp(testbenchs[testcase_idx].name, "update_tbl_by_threshold")) { + for (uint32_t i = 0; i < (uint32_t)shape_sz; i++) { + if (mask[i] != tpu_output_mask[i]) { + debug_print("comparing mask failed output[%u] got %u, ref %u\n", i, tpu_output_mask[i], + mask[i]); + // fail case + 
fflush(stderr); + exit(-1); + } + } + } + + if (tiles == 1) { + debug_print("%s", " pass\n"); + } else { + // get elapsed time + debug_print("(takes %ld us)\n", elapsed); + } + + // free resource from tpu memory + for (int i = alloc_nr - 1; i >= 0; --i) { + free_tl(cvk_ctx, tl_buf[i]); + free_tl(cvk_ctx, tl_update_tbl[i]); + free_tl(cvk_ctx, tl_mask[i]); + free_tl(cvk_ctx, tl_ofmap[i]); + free_tl(cvk_ctx, tl_ifmap3[i]); + free_tl(cvk_ctx, tl_ifmap2[i]); + } + free_tl(cvk_ctx, tl_bias); + free_tl(cvk_ctx, tl_kernel); + + // free resource from host memory + // free(input_data); + free(ref_data); + free(tpu_output_data); + free(tpu_output_mask); + free(input_data2); + free(input_data3); + free(mask); + free(update_tbl); + free(_ref_data); + free(kernel_data); + free(bias_data); +} + +int main() { + CVI_RT_HANDLE rt_ctx; + cvk_context_t *cvk_ctx; + + // init runtime / kerenl structure + test_init(&rt_ctx, &cvk_ctx); + + cvk_tg_shape_t tg_shape = {1, 20, 3, 4}; + + // run test + int testbench_nr = sizeof(testbenchs) / sizeof(testbenchs[0]); + + for (int i = 0; i < testbench_nr; i++) { + testbench(&rt_ctx, cvk_ctx, &tg_shape, i); + } +#if 1 + + // run test without ping-pong + tg_shape = {1, 128, 340, 16}; + + printf("[heavy data] w/o ping pong\n"); + + // NOTICE: only check c + int tiles = std::ceil(tg_shape.c / (float)cvk_ctx->info.npu_num); + if (tg_shape.c > cvk_ctx->info.npu_num) { + debug_print("tile nr %d channel base one npu nr %d\n", tiles, cvk_ctx->info.npu_num); + } + + for (int i = 0; i < testbench_nr; i++) { + testbench(&rt_ctx, cvk_ctx, &tg_shape, i); + } + + tg_shape = {1, 128, 340, 16}; + printf("[heavy data] w/ ping pong\n"); + for (int i = 0; i < testbench_nr; i++) { + testbench(&rt_ctx, cvk_ctx, &tg_shape, i, /*is_pingpong=*/true); + } +#endif + // de-init runtime / kerenl structure + test_exit(&rt_ctx, cvk_ctx); + + printf("all pass\n"); + + return 0; +} diff --git a/cvimath/sample/sample_sigmoid_linear_interp.cpp 
b/cvimath/sample/sample_sigmoid_linear_interp.cpp new file mode 100644 index 000000000..74db96cf8 --- /dev/null +++ b/cvimath/sample/sample_sigmoid_linear_interp.cpp @@ -0,0 +1,165 @@ +// \file implement activation function(sigmoid) by interpolation lookup table, +// please refer [here](https://en.wikipedia.org/wiki/Linear_interpolation) for more details + +// header include +#include +#include // math +#include // kerenl + +// ========== user config ============ +#define MAX_ERROR (0.004) // tolerance +// for current example, we quauntize data to -8 ~ +8 +// range depend on ur activation +static int range_start = -8; +static int range_end = 8; +// ========== end of user config ============ + +// gen reference by cpu +static double _gen_sigmoid(float x) { return 1.0 / (1.0 + exp(-(x))); } + +// gen reference +static void gen_ref(uint16_t *ofmap, uint16_t *ifmap, cvk_tl_shape_t ifmap_shape) { + for (uint64_t i = 0; i < tl_shape_size(&ifmap_shape); i++) { + ofmap[i] = convert_fp32_bf16(_gen_sigmoid(convert_bf16_fp32(ifmap[i]))); + } +} + +// verify cpu data with tpu +static bool verify(uint16_t *ofmap_data, uint16_t *ref_data, uint64_t ofmap_size) { + int count = 0; + uint64_t size = ofmap_size; + + for (uint64_t i = 0; i < size; i++) { + float got = convert_bf16_fp32(ofmap_data[i]); + float exp = convert_bf16_fp32(ref_data[i]); + + if (fabs(got - exp) > MAX_ERROR) { + fprintf(stderr, + "[%d] comparing failed at ofmap_data[%u], got %x, exp %x, " + "diff(%f - %f) is %f\n", + count, (uint32_t)i, ofmap_data[i], ref_data[i], got, exp, fabs(got - exp)); + count++; + } + } + + // exit if fail + if (count != 0) { + printf("error count is %d\n", count); + exit(-1); + } + + return true; +} + +// gen random input for test +static void gen_input(uint16_t *ifmap, uint64_t ifmap_size) { + int table_hw = 256; + for (uint64_t i = 0; i < ifmap_size; i++) { + // input range is -8 ~ +8 + float input = ((int)i % 7) * (((int)i % 2) ? 
1 : -1) + 0.03 + (i % table_hw) * 0.002; + ifmap[i] = convert_fp32_bf16(input); + } +} + +// main code for test sigmoid interpolate implement by lookup table +static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) { + // example for input tensor + cvk_tl_shape_t ifmap_shape = {1, 32, 16, 16}; + cvk_fmt_t fmt = CVK_FMT_BF16; + + // get table / input shape + cvk_tl_shape_t table_shape; + cvm_table_shape(bmk, &table_shape); + cvk_tl_shape_t ofmap_shape = ifmap_shape; + + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t table_size = tl_shape_size(&table_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + + // get table/input size + int data_type_size = 1; + if (fmt == CVK_FMT_BF16) { + // bf16 takes 2 bytes + data_type_size = 2; + } + + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + uint64_t table_bytesize = table_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + + // alloc host memory + uint16_t *ifmap = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *table_data = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_slope = (uint16_t *)xmalloc(table_bytesize); + uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize); + + // gen input and assign data in host + gen_input(ifmap, ifmap_size); + + // gen table, interpolation need 2 tables, one for lookup, another one is slope + cvm_sigmoid_tbl(table_data, table_data_slope, &table_shape, range_start, range_end); + + // gen reference + gen_ref(ref_data, ifmap, ofmap_shape); + + // alloc input / output / tmp / lookup table / slope table + cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *cvk_tl_table_answer = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *cvk_tl_table_answer_slope = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + + // device 
memory load to local memory + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)ifmap); + test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer, (uint8_t *)table_data); + test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer_slope, (uint8_t *)table_data_slope); + + // get quantize(scale) value + float scale = cvm_sigmoid_scale(range_start, range_end); + + // emit core function + cvm_emit_sigmoid(bmk, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_slope, + tl_ofmap_bf16, scale); + + // get result from device to host + uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, tl_ofmap_bf16); + + // verify data with tolerance + verify(ofmap_data, ref_data, ofmap_size); + + // release device memory in revert order + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_buf); + free_tl(bmk, cvk_tl_table_answer_slope); + free_tl(bmk, cvk_tl_table_answer); + free_tl(bmk, tl_ifmap); + + // release host memory + free(ifmap); + free(table_data); + free(table_data_slope); + free(ref_data); + free(ofmap_data); +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bmk; + int round_mode; + + round_mode = set_store_feround(); + + // init runtime / kerenl structure + test_init(&ctx, &bmk); + + // emit test case + testbench(&ctx, bmk); + + // de-init runtime / kerenl structure + test_exit(&ctx, bmk); + + // restore rounding + restore_feround(round_mode); + + return 0; +} diff --git a/cvimath/sample/sample_upsample.cpp b/cvimath/sample/sample_upsample.cpp new file mode 100644 index 000000000..c101967da --- /dev/null +++ b/cvimath/sample/sample_upsample.cpp @@ -0,0 +1,145 @@ +// \file sample for set value by mask, plz refer \cvimath_internal.h for more details + +// header include +#include +#include // math +#include // kerenl + +static void init_input(uint8_t *input_data, uint64_t ifmap_size) { + for (uint64_t i = 0; i < ifmap_size; i++) { + input_data[i] = i; + } +} + +static void init_weight(uint8_t *weight_data, uint64_t weight_size) { + for (uint64_t 
// CPU reference for nearest-neighbor 2-D upsampling on NCHW uint8 data.
// Every output pixel (y, x) copies input pixel (y / scale_h, x / scale_w).
// Always returns 0.
static int init_ref(uint8_t *input, uint8_t *output, int n, int c, int ih, int iw, int scale_h,
                    int scale_w) {
  const int oh = ih * scale_h;
  const int ow = iw * scale_w;

  // n and c only select the plane, so walk them as one fused index.
  for (int plane = 0; plane < n * c; plane++) {
    const uint8_t *src = input + (size_t)plane * ih * iw;
    uint8_t *dst = output + (size_t)plane * oh * ow;
    for (int y = 0; y < oh; y++) {
      const uint8_t *src_row = src + (size_t)(y / scale_h) * iw;
      for (int x = 0; x < ow; x++) {
        dst[(size_t)y * ow + x] = src_row[x / scale_w];
      }
    }
  }
  return 0;
}
*)xmalloc(weight_bytesize); + + // init input / output data in ddr + init_input(input_data, ifmap_size); + init_weight(weight_data, weight_bytesize); // fix pattern + init_ref(input_data, ref_data, ifmap_shape.n, ifmap_shape.c, ifmap_shape.h, ifmap_shape.w, + scale_h, scale_w); + + // alloc on sram + cvk_fmt_t fmt = CVK_FMT_I8; + int eu_align = 1; + cvk_tl_t *tl_ifmap = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align); + cvk_tl_t *tl_weight = test_alloc_tl(cvk_ctx, weight_shape, fmt, eu_align); + cvk_tl_t *tl_ofmap = test_alloc_tl(cvk_ctx, ofmap_shape, fmt, eu_align); + + // send device memory to sram + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_ifmap, input_data); + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_weight, weight_data); + + // generate descriptor + cvm_upsample2d(cvk_ctx, tl_ifmap, tl_weight, tl_ofmap); + + // submit descriptor + test_submit_comp(rt_ctx, cvk_ctx); + + // get data from tl + uint8_t *ofmap_data = (uint8_t *)test_get_tensor_l2g_comp(rt_ctx, cvk_ctx, tl_ofmap); + + // compare with reference with byte + for (uint32_t i = 0; i < ofmap_size; i++) { + if (ref_data[i] != ofmap_data[i]) { + fprintf(stderr, "comparing failed output[%u] got %u, ref %u\n", i, ofmap_data[i], + ref_data[i]); + // fail case + fflush(stderr); + exit(-1); + } + } + + // free resource from tpu memory + free_tl(cvk_ctx, tl_ofmap); + free_tl(cvk_ctx, tl_weight); + free_tl(cvk_ctx, tl_ifmap); + + // free resource from host memory + free(ref_data); + free(weight_data); + free(ofmap_data); + free(input_data); +} + +int main() { + CVI_RT_HANDLE rt_ctx; + cvk_context_t *cvk_ctx; + + // init runtime / kerenl structure + test_init(&rt_ctx, &cvk_ctx); + + cvk_tg_shape_t tg_shape = {1, 20, 3, 4}; + // cvk_tg_shape_t tg_shape = {1, 20, 3, 40}; + + // run test + testbench(&rt_ctx, cvk_ctx, &tg_shape); + + // de-init runtime / kerenl structure + test_exit(&rt_ctx, cvk_ctx); + + printf("pass\n"); + + return 0; +} diff --git a/cvimath/src/1880v2_fp_convert.c 
b/cvimath/src/1880v2_fp_convert.c new file mode 100644 index 000000000..f750f11df --- /dev/null +++ b/cvimath/src/1880v2_fp_convert.c @@ -0,0 +1,293 @@ +#ifndef ATOMIC_FP_H_ +#define ATOMIC_FP_H_ + +#if __arm__ +#define __DISABLE_FENV__ +#endif + +#ifndef __DISABLE_FENV__ +#include +#endif +#include +#include // uint8_t / uint16_t + +#ifdef __cplusplus +extern "C" { +#endif + +uint8_t convert_bf16_u8(uint16_t data); +uint8_t _convert_bf16_u8(uint16_t data, int int8_rnd_md); +int8_t _convert_bf16_s8(uint16_t data, int int8_rnd_md); +int8_t convert_bf16_s8(uint16_t data); +uint16_t convert_int8_bf16(uint8_t data, uint8_t sign); +uint32_t convert_fp32_u32(float fp32); +uint32_t convert_fp32_hex(float val); +float convert_hex_fp32(uint32_t hval); + +float convert_bf16_fp32(uint16_t bf16); +uint16_t convert_fp32_bf16(float fp32); + +void f32_integer(void *if32, void *o_integer, int integer_size, int accumulate, int int8_signed, + int int8_rnd_md); +// void f32_integer(void *if32, void *o_integer, +// 0 for 32 bit , 1 for 16 bit , 2 for 8 bit +// int integer_size, int accumulate = 0, int int8_signed = 1, int int8_rnd_md = 0); + +union convert_type_float { + float fval; + uint16_t bf16[2]; + uint32_t ival; +}; + +typedef union convert_type_float convert_int_float; +static const uint16_t NAN_VALUE = 0x7FC0; + +// static int round_mode = 0; +static uint8_t float_isnan(const float x) { + // return isnan(x); + return x != x; +} + +int set_store_feround() { +#ifndef __DISABLE_FENV__ + int round_mode = fegetround(); + fesetround(FE_TOWARDZERO); + return round_mode; +#else + return 0; +#endif +} + +void restore_feround(int round_mode) { +#ifndef __DISABLE_FENV__ + fesetround(round_mode); +#else + (void)round_mode; +#endif +} + +uint8_t _convert_bf16_u8(uint16_t data, int int8_rnd_md) { + /* convert bf16 to float32*/ + float fp32; + convert_int_float convert_val; + fp32 = convert_bf16_fp32(data); + /* convert float32 to uint8_t*/ + f32_integer((void *)&fp32, &convert_val.ival, 2, 
// Convert an IEEE-754 float to bf16 with round-to-nearest-even on the
// dropped 16 mantissa bits. NaN maps to the canonical quiet-NaN pattern
// 0x7FC0; any result whose exponent field saturates (inf, or overflow
// produced by the rounding add) is squashed to 0x7f7f to mirror the
// hardware's behavior.
uint16_t convert_fp32_bf16(float fp32) {
  if (fp32 != fp32) { /* NaN check without <math.h> */
    return 0x7FC0;    /* NAN_VALUE */
  }

  union {
    float f;
    uint32_t u;
  } bits;
  bits.f = fp32;

  /* Round to nearest, ties to even: bias is 0x7fff plus the keep-bit LSB. */
  const uint32_t keep_lsb = (bits.u >> 16) & 1u;
  uint16_t bf16 = (uint16_t)((bits.u + 0x7fffu + keep_lsb) >> 16);

  /* HW behavior: saturated exponent field becomes 0x7f7f. */
  if ((bf16 & 0x7f80) == 0x7f80) {
    bf16 = 0x7f7f;
  }
  return bf16;
}
// Split the value encoded by float x into an integral part and the float
// remainder: on return *integer_part + *sub_part reconstructs x for
// in-range inputs. When the exponent exceeds the 32-bit integer range
// (level > 30, which also catches inf/NaN exponent fields) the integer
// part saturates: 0x7fffffff (INT32_MAX) normally, or 0x80000000
// (|INT32_MIN|) when `sign` says the caller is handling a negative value.
// NOTE(review): callers appear to pass the magnitude in x and carry the
// sign separately in `sign` -- confirm against call sites.
void flt2int_flt(float x, unsigned long long *integer_part, float *sub_part, uint8_t sign) {
  union {
    float fval;
    uint32_t ival;
  } work_x;
  work_x.fval = x;

  /* Unbiased exponent of x. */
  int level_code = (int)((work_x.ival >> 23) & 0xff) - 127;

  if (level_code < 0) {
    /* |x| < 1: the integer part is zero, everything is remainder. */
    *integer_part = 0;
    *sub_part = x;
    return;
  }

  /* Mantissa with the implicit leading 1 restored. */
  unsigned long long tail_code = (work_x.ival & 0x7fffffu) | 0x800000u;

  if (level_code < 23) {
    /* Some mantissa bits are fractional: shift them out for the integer
     * part, mask them off in the float to recover the remainder. */
    *integer_part = tail_code >> (23 - level_code);
    work_x.ival &= 0xffffffffu << (23 - level_code);
    *sub_part = x - work_x.fval;
  } else if (level_code <= 30) {
    /* Whole mantissa is integral and the result still fits 32 bits. */
    *integer_part = tail_code << (level_code - 23);
    *sub_part = 0;
  } else {
    /* Out of 32-bit range: saturate. Bug fixes: the negative clamp was
     * written 0x800000000 (extra zero -- a 36-bit constant) instead of
     * the intended 0x80000000, and the old code performed the left shift
     * before this range check, which was undefined behavior for very
     * large exponents. */
    *integer_part = sign ? 0x80000000ull : 0x7fffffffull;
    *sub_part = 0;
  }
}
-ifval : ifval; + double sub_part; + double integer; + unsigned long long integer_part; + // uint8_t sign = !isPositive; + // flt2int_flt(abs_fval, &integer_part, &sub_part, sign); + sub_part = modf((double)abs_fval, &integer); + integer_part = (unsigned long long)integer; + if (!isPositive) { + unsigned long long result; + if (int8_rnd_md == 0) { // round to nearest even + if (sub_part > 0.5f) { + result = integer_part + 1; + } else if (sub_part == 0.5f) { + if (integer_part & 0x1) { + result = integer_part + 1; + } else { + result = integer_part; + } + } else { + result = integer_part; + } + } else { // round to zero + result = integer_part; + } + if (result > 0x80000000UL) { + result = 0x80000000UL; + } + return -result; + } else { + unsigned long long result; + if (int8_rnd_md == 0) { // round to nearest even + if (sub_part > 0.5f) { + result = integer_part + 1; + } else if (sub_part == 0.5f) { + if (integer_part & 0x1) { + result = integer_part + 1; + } else { + result = integer_part; + } + } else { + result = integer_part; + } + } else { + result = integer_part; + } + if (result > 0x7fffffff) { + result = 0x7fffffff; + } + return result; + } +} + +void f32_integer(void *if32, void *o_integer, int integer_size, int accumulate, int int8_signed, + int int8_rnd_md) { + int i_tmp; + float *f_tmp; + f_tmp = (float *)if32; + i_tmp = flt2int(*f_tmp, int8_rnd_md); + int *o32 = (int *)o_integer; + int dst_f32 = *o32; + short *o16 = (short *)o_integer; + short dst_o16 = *o32; + char *o8 = (char *)o_integer; + char dst_o8 = *o8; + + if (integer_size == 0) { + *o32 = i_tmp; + } else if (integer_size == 1) { + *o16 = i_tmp; + } else { + *o8 = i_tmp; + int min = (int8_signed) ? -128 : 0; + int max = (int8_signed) ? 
# Build libcvimath as both a shared and a static library from the same
# explicit source list, linked against the TPU kernel library chosen by
# the top-level build (${TPU_KERNEL_LIB}).
project(cvimath)

# Explicit list instead of file(GLOB): globs silently miss files added
# after configure time and hide source additions from diffs.
set(CVIMATH_SRC
    1880v2_fp_convert.c
    bf16_gemm.c
    blas_cpu.cpp
    chl_quan.cpp
    common.c
    fp32_bf16_kernel.c
    set_val_by_mask.c
    tiu_lut_atan.c
    tiu_lut_atan2.c
    tiu_reciprocal.c
    tiu_reshape_c.c
    tiu_sigmoid.c
    tiu_sqrt.c
    tiu_upsample.c
    util.c
)

add_library(${PROJECT_NAME} SHARED ${CVIMATH_SRC})
# CMAKE_SOURCE_DIR (not PROJECT_SOURCE_DIR) on purpose: the public headers
# live in <top>/include, one level above this directory.
target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_SOURCE_DIR}/include)
target_link_libraries(${PROJECT_NAME} PUBLIC ${TPU_KERNEL_LIB})
install(TARGETS ${PROJECT_NAME} DESTINATION lib)

add_library(${PROJECT_NAME}-static STATIC ${CVIMATH_SRC})
target_include_directories(${PROJECT_NAME}-static PUBLIC ${CMAKE_SOURCE_DIR}/include)
target_link_libraries(${PROJECT_NAME}-static PUBLIC ${TPU_KERNEL_LIB})
install(TARGETS ${PROJECT_NAME}-static DESTINATION lib)
2 : 1; + + cvk_ml_t *t; + if (pre) { + t = pre; + } else { + t = xmalloc(sizeof(*t)); + } + + t->start_address = la; + t->fmt = fmt; + t->shape = s; + t->stride.h = s.w * val; + if (eu_align) + t->stride.c = align_up(s.w * val, eu_num); + else + t->stride.c = s.w * val; + t->stride.n = t->stride.c * ceiling_func(s.c, npu_num); + + uint32_t needed = align_up(t->shape.n * t->stride.n, eu_num); + + if (lmem_size - lmem_ptr < needed) { + if (!pre) { + free(t); + } + ASSERT(0 && "not enough local memory alloc"); + return NULL; + } + // ctx->lmem_ptr += needed; + lmem_ptr = la + 1; + return t; +} + +void bmk1880v2_lmem_free_prealloc_bf16_matrix(cvk_context_t *ctx, bool is_pre_alloc, + const cvk_ml_t *t) { + // printf("free from %d, lmem_ptr is %d\n", t->start_address, ctx->lmem_ptr); + (void)ctx; + ASSERT(t->start_address < lmem_ptr); + lmem_ptr = t->start_address; + if (!is_pre_alloc) { + free((void *)t); + } +} + +static void tdma_store_stride_bf16(cvk_context_t *ctx, cvk_ml_t *tlp, uint64_t ga_dst, + cvk_mg_stride_t ts_stride, ctrl_t ctrl) { + bool DoTranspose = (ctrl & CTRL_TP) ? true : false; + bool isNeuron = (ctrl & CTRL_NEURON) ? true : false; + + ASSERT(DoTranspose == false); + (void)DoTranspose; + + // tensor in system memory + // Global shape use local shape + // Global shape used for stride calculation + cvk_mg_t ts_data; + ts_data.base_reg_index = isNeuron ? NEURON_MEMORY : WEIGHT_MEMORY; + ts_data.start_address = ga_dst; + ts_data.fmt = tlp->fmt; + ts_data.shape.row = tlp->shape.n; + ts_data.shape.col = tlp->shape.col; + ts_data.stride = ts_stride; + + cvk_tdma_l2g_matrix_copy_param_t p1; + p1.src = tlp; + p1.dst = &ts_data; + ctx->ops->tdma_l2g_bf16_matrix_copy(ctx, &p1); +} + +static void tdma_load_stride_bf16(cvk_context_t *ctx, cvk_ml_t *tlp, uint64_t ga_src, + cvk_mg_stride_t ts_stride, ctrl_t ctrl) { + ASSERT(tlp != NULL); + + bool DoTranspose = (ctrl & CTRL_TP) ? true : false; + bool isNeuron = (ctrl & CTRL_NEURON) ? 
true : false; + (void)DoTranspose; + + // Global memory from reshaped local memory + cvk_mg_t ts_data; + ts_data.base_reg_index = isNeuron ? NEURON_MEMORY : WEIGHT_MEMORY; + ts_data.start_address = ga_src; + ts_data.fmt = tlp->fmt; + ts_data.shape.row = tlp->shape.n; + ts_data.shape.col = tlp->shape.col; + ts_data.stride = ts_stride; + + // BM1880v2 tdma does not support transposed matrix load + ASSERT(!DoTranspose); + + cvk_tdma_g2l_matrix_copy_param_t p1; + p1.src = &ts_data; + p1.dst = tlp; + ctx->ops->tdma_g2l_bf16_matrix_copy(ctx, &p1); +} +// +// Shape/stride used in TDMA may not the same as in TIU. +// Adjust shape/stride for TIU. +// +// E.g. +// Y(0, 4) = L(1, 256) * R(256, 4) + B(1, 4) +// +// TDMA: +// L(0, 16, 1, 16) +// R(255, 1, 1, 4) +// B(0, 1, 1, 4) +// +// TIU: +// Y res0(1, 1, 1, 16) +// L opd0(1, 16, 1, 16) +// R opd1(256, 1, 1, 16) +// B opd2(1, 1, 1, 16) +// +static void matrix_multiplication(cvk_context_t *ctx, cvk_tiu_matrix_multiplication_param_t *p) { + // No need to adjust shape/stride + if (p->res->shape.w >= ctx->info.eu_num) { + // LLVM_DEBUG(printf(" L(%d, %d), R(%d, %d)\n", p->left->shape.n, + // p->left->shape.col, p->right->shape.n, + // p->right->shape.col);); + ctx->ops->tiu_matrix_multiplication(ctx, p); + + return; + } + + // + // New shape/stride to align ctx->info.eu_num + // adjust w as ctx->info.eu_num + // + cvk_ml_t tl_res; + tl_res.start_address = p->res->start_address; + tl_res.fmt = p->res->fmt; + tl_res.shape.n = p->res->shape.n; + tl_res.shape.c = p->res->shape.c; + tl_res.shape.w = (uint32_t)(ctx->info.eu_num); + tl_res.shape.col = p->res->shape.col; + tl_res.stride = ctx->ops->ml_default_stride(ctx, tl_res.shape, CVK_FMT_BF16, /*eu_align=*/1); + + cvk_ml_t tl_right; + tl_right.start_address = p->right->start_address; + tl_right.fmt = p->right->fmt; + tl_right.shape.n = p->right->shape.n; + tl_right.shape.c = p->right->shape.c; + tl_right.shape.w = (uint32_t)(ctx->info.eu_num); + tl_right.shape.col = 
p->right->shape.col; + tl_right.stride = ctx->ops->ml_default_stride(ctx, tl_right.shape, CVK_FMT_BF16, /*eu_align=*/1); + + cvk_ml_t tl_bias = {0}; + if (p->bias) { + tl_bias.start_address = p->bias->start_address; + tl_bias.fmt = p->bias->fmt; + tl_bias.shape.n = p->bias->shape.n; + tl_bias.shape.c = p->bias->shape.c; + tl_bias.shape.w = (uint32_t)(ctx->info.eu_num); + tl_bias.shape.col = p->bias->shape.col; + tl_bias.stride = ctx->ops->ml_default_stride(ctx, tl_bias.shape, CVK_FMT_BF16, /*eu_align=*/1); + } + + cvk_tiu_matrix_multiplication_param_t p2; + // copy p to p2 + p2.res = p->res; + p2.left = p->left; + p2.right = p->right; + p2.bias = p->bias; + p2.lshift_bits = p->lshift_bits; + p2.rshift_bits = p->rshift_bits; + p2.res_is_int8 = p->res_is_int8; + p2.add_result = p->add_result; + p2.relu_enable = p->relu_enable; + p2.ps32_mode = p->ps32_mode; + p2.res_is_int8 = p->res_is_int8; + + p2.layer_id = p->layer_id; + // p2.sw_op_info = p->sw_op_info; + + p2.res = &tl_res; + p2.left = p->left; + p2.right = &tl_right; + p2.bias = p->bias ? 
&tl_bias : NULL; + + LLVM_DEBUG(printf(" Modified L(%d, %d), R(%d, %d)\n", p2.left->shape.n, p2.left->shape.col, + p2.right->shape.n, p2.right->shape.col);); + + ctx->ops->tiu_matrix_multiplication(ctx, &p2); +} + +static void fc_slicing_multi_dimention(cvk_context_t *ctx, uint32_t layer_id, + gaddr_t global_offset_bottom_data, + gaddr_t global_offset_weight_data, + gaddr_t global_offset_bias_data, + gaddr_t global_offset_top_data, int input_row_num, + int input_col_num, int weight_col_num, int have_bias, + int do_activation, int activation_method) { + // Y(M, K) = L(M, K) * R(K, N) + B(1, N) + uint32_t M = (uint32_t)(input_row_num); + uint32_t K = (uint32_t)(input_col_num); + uint32_t N = (uint32_t)(weight_col_num); + + LLVM_DEBUG(printf("fc_slicing_multi_dimension\n" + " Y(%d, %d) = L(%d, %d) * R(%d, %d) + B(%d, %d)\n", + M, N, M, K, K, N, 1, N);); + + // Split N <= max total eu number + uint32_t total_eu = ctx->info.npu_num * ctx->info.eu_num; + uint32_t tiled_N = (N >= total_eu) ? total_eu : N; + + // Split K based on lane size + uint32_t lane_size = ctx->info.lmem_size; + uint32_t max_k = (1 << 12) - 1; // 1880v2: 12 bit + uint32_t tiled_K = (K >= max_k) ? max_k : K; + + // Tiled Y + cvk_ml_t tl_tiled_Y = {0}; + tl_tiled_Y.fmt = CVK_FMT_BF16; + + // Tiled L + cvk_ml_t tl_tiled_L = {0}; + tl_tiled_L.fmt = CVK_FMT_BF16; + + // Tiled R + cvk_ml_t tl_tiled_R = {0}; + tl_tiled_R.fmt = CVK_FMT_BF16; + + // Tiled B + cvk_ml_t tl_tiled_B = {0}; + if (have_bias) { + // ctx->ops->tiu_matrix_multiplication will change shape.n from 2 to 1 + // So we use the shape for both dma load and local memory allocation. 
+ + // Upper16 [31:16] then Lower16 [15:0] separated by b_stride + tl_tiled_B.fmt = CVK_FMT_BF16; + tl_tiled_B.shape = ctx->ops->ml_default_shape(ctx, sizeof(uint32_t) / sizeof(uint16_t), tiled_N, + CVK_FMT_BF16); // 2 x 16bit + tl_tiled_B.stride = + ctx->ops->ml_default_stride(ctx, tl_tiled_B.shape, CVK_FMT_BF16, /*eu_align=*/1); + } + + // Tiled local memory layout: + // Y at fixed position since last tiled ones may be smaller + // + // tiled Y, [7:0] + // tiled Y, [15:8] + // tiled Y, [23:16] + // tiled Y, [31:24] + // tiled L [15:0] + // tiled R [15:0] + // tiled B, [31:16], if existed + // tiled B, [15:0], if existed + + // Find max tiled K + uint32_t required_size = 0; + do { + required_size = 0; // Start of LMEM + + // Not split M since we don't want to reload L(weight) + // or reload partial result of different M. + // + // Y(M, N) = L(M, K) * R(K, N) + B(1, N) + // tiled_Y(M, tiled_N) = tiled_L(M, tiled_K) * tiled_R(tiled_K, tiled_N) + tiled_B(1, tiled_N) + + // tiled Y, 2 * 16bit + tl_tiled_Y.start_address = required_size; + tl_tiled_Y.shape = ctx->ops->ml_default_shape(ctx, M, tiled_N, CVK_FMT_BF16); + tl_tiled_Y.stride = + ctx->ops->ml_default_stride(ctx, tl_tiled_Y.shape, CVK_FMT_BF16, /*eu_align=*/1); + required_size += ctx->ops->lmem_ps32_matrix_to_size(ctx, tl_tiled_Y.shape, CVK_FMT_BF16, + /*eu_align=*/1); + + // tiled L, 16bit + tl_tiled_L.start_address = required_size; + tl_tiled_L.shape = ctx->ops->ml_default_shape(ctx, M, tiled_K, CVK_FMT_BF16); + tl_tiled_L.stride = + ctx->ops->ml_default_stride(ctx, tl_tiled_L.shape, CVK_FMT_BF16, /*eu_align=*/1); + required_size += + ctx->ops->lmem_matrix_to_size(ctx, tl_tiled_L.shape, CVK_FMT_BF16, /*eu_align=*/1); + + // tiled R, 16bit + tl_tiled_R.start_address = required_size; + tl_tiled_R.shape = ctx->ops->ml_default_shape(ctx, tiled_K, tiled_N, CVK_FMT_BF16); + tl_tiled_R.stride = + ctx->ops->ml_default_stride(ctx, tl_tiled_R.shape, CVK_FMT_BF16, /*eu_align=*/1); + required_size += + 
ctx->ops->lmem_matrix_to_size(ctx, tl_tiled_R.shape, CVK_FMT_BF16, /*eu_align=*/1); + + // tiled B, 2 * 16bit + if (have_bias) { + tl_tiled_B.start_address = required_size; + required_size += + ctx->ops->lmem_matrix_to_size(ctx, tl_tiled_B.shape, CVK_FMT_BF16, /*eu_align=*/1); + } + + if (required_size <= lane_size) { + // LLVM_DEBUG(printf(" tiled_Y %d, tiled_L %d, tiled_R %d, tiled_B %d, required_size %d\n", + // ctx->ops->lmem_ps32_matrix_to_size(ctx, tl_tiled_Y.shape, + // CVK_FMT_BF16, /*eu_align=*/1), + // ctx->ops->lmem_matrix_to_size(ctx, tl_tiled_L.shape, + // CVK_FMT_BF16, /*eu_align=*/1), + // ctx->ops->lmem_matrix_to_size(ctx, tl_tiled_R.shape, + // CVK_FMT_BF16, /*eu_align=*/1), + // ctx->ops->lmem_matrix_to_size(ctx, tl_tiled_B.shape, + // CVK_FMT_BF16, /*eu_align=*/1), required_size);); + + break; + } + + } while (--tiled_K); + + LLVM_DEBUG(printf(" tiled_Y(%d, %d) = tiled_L(%d, %d) * tiled_R(%d, %d) + tiled_B(%d, %d)," + " required_size %d kB\n", + M, tiled_N, M, tiled_K, tiled_K, tiled_N, 1, tiled_N, required_size / 1024);); + + LLVM_DEBUG( + printf(" tiled_Y shape (n=%d, c=%d, w=%d, col=%d), stride(n=%d, c=%d, h=%d)\n" + " tiled_L shape (n=%d, c=%d, w=%d, col=%d), stride(n=%d, c=%d, h=%d)\n" + " tiled_R shape (n=%d, c=%d, w=%d, col=%d), stride(n=%d, c=%d, h=%d)\n" + " tiled_B shape (n=%d, c=%d, w=%d, col=%d), stride(n=%d, c=%d, h=%d)\n", + tl_tiled_Y.shape.n, tl_tiled_Y.shape.c, tl_tiled_Y.shape.w, tl_tiled_Y.shape.col, + tl_tiled_Y.stride.n, tl_tiled_Y.stride.c, tl_tiled_Y.stride.h, tl_tiled_L.shape.n, + tl_tiled_L.shape.c, tl_tiled_L.shape.w, tl_tiled_L.shape.col, tl_tiled_L.stride.n, + tl_tiled_L.stride.c, tl_tiled_L.stride.h, tl_tiled_R.shape.n, tl_tiled_R.shape.c, + tl_tiled_R.shape.w, tl_tiled_R.shape.col, tl_tiled_R.stride.n, tl_tiled_R.stride.c, + tl_tiled_R.stride.h, tl_tiled_B.shape.n, tl_tiled_B.shape.c, tl_tiled_B.shape.w, + tl_tiled_B.shape.col, tl_tiled_B.stride.n, tl_tiled_B.stride.c, tl_tiled_B.stride.h);); + + ASSERT(tiled_K); + 
if (!tiled_K) { + return; + } + + // Each tiled_R(weight) is only loaded once. + // tiled_L(input) reload is reload once tiled_weight moves right. + // + // for each tiled N + for (uint32_t offset_N = 0; offset_N < N; offset_N += tiled_N) { + // Y = [Y0, Y1, ... Yn-1] + + // Actual width + uint32_t width_N = ((offset_N + tiled_N) <= N) ? tiled_N : (N - offset_N); + + // for each tiled K + for (uint32_t offset_K = 0; offset_K < K; offset_K += tiled_K) { + // Y(M, K) = L(M, K) * R(K, N) + B(1, N) + // tiled_Y(M, tiled_K) = tiled_L(M, tiled_K) * tiled_R(tiled_K, tiled_N) + tiled_B(1, tiled_N) + // + // L = [L0, L1, ... Lk-1] + // R = [R0,0, R0,1, ..., R0,n-1 + // R1,0, + // + // Rk-1,0, Rk-1,1, ..., Rk-1,n-1] + // B = [B0, B1, ... Bn-1] + // + // tiled_y,i += L0 * R0,i + L1 * R1,i + ... + Ln-1 * Rk-1,i + Bi + + // Actual width + uint32_t width_K = ((offset_K + tiled_K) <= K) ? tiled_K : (K - offset_K); + + required_size = 0; // Start of LMEM + + // tiled Y, 32bit + tl_tiled_Y.start_address = required_size; + tl_tiled_Y.shape = ctx->ops->ml_default_shape(ctx, M, width_N, CVK_FMT_BF16); // actual width + required_size += ctx->ops->lmem_ps32_matrix_to_size(ctx, tl_tiled_Y.shape, CVK_FMT_BF16, + /*eu_align=*/1); + + // Load tiled L from global memory, input + tl_tiled_L.start_address = required_size; + tl_tiled_L.shape = ctx->ops->ml_default_shape(ctx, M, width_K, CVK_FMT_BF16); // actual width + tl_tiled_L.stride = ctx->ops->ml_default_stride(ctx, tl_tiled_L.shape, CVK_FMT_BF16, + /*eu_align=*/1); + required_size += + ctx->ops->lmem_matrix_to_size(ctx, tl_tiled_L.shape, CVK_FMT_BF16, /*eu_align=*/1); + cvk_mg_stride_t ts_stride; + ts_stride.row = K * sizeof(uint16_t); + tdma_load_stride_bf16(ctx, &tl_tiled_L, + global_offset_bottom_data + offset_K * sizeof(uint16_t), + ts_stride, // original column width + CTRL_NEURON); + + // Load tiled R from global memory, weight + tl_tiled_R.start_address = required_size; + tl_tiled_R.shape = + ctx->ops->ml_default_shape(ctx, 
width_K, width_N, CVK_FMT_BF16); // actual width + tl_tiled_R.stride = ctx->ops->ml_default_stride(ctx, tl_tiled_R.shape, CVK_FMT_BF16, + /*eu_align=*/1); + required_size += + ctx->ops->lmem_matrix_to_size(ctx, tl_tiled_R.shape, CVK_FMT_BF16, /*eu_aligned=*/1); + + ts_stride.row = N * sizeof(uint16_t); + tdma_load_stride_bf16( + ctx, &tl_tiled_R, + global_offset_weight_data + (offset_K * N + offset_N) * sizeof(uint16_t), + ts_stride, // original column width + CTRL_NEURON); + + // Load tiled B(bias) from gobale memory at last time as H/W does + // we need temporary shape to load uppper 16bit and lower 16bit + bool is_last_tile = ((offset_K + tiled_K) >= K) ? true : false; + bool B_needed = (is_last_tile && have_bias) ? true : false; + if (B_needed) { + tl_tiled_B.start_address = required_size; + + tl_tiled_B.shape = + ctx->ops->ml_default_shape(ctx, sizeof(uint32_t) / sizeof(uint16_t), width_N, + CVK_FMT_BF16); // 2 x 16bit, actual width + tl_tiled_B.stride = + ctx->ops->ml_default_stride(ctx, tl_tiled_B.shape, CVK_FMT_BF16, /*eu_align=*/1); + required_size += ctx->ops->lmem_matrix_to_size(ctx, tl_tiled_B.shape, CVK_FMT_BF16, + /*eu_aligned=*/1); + ASSERT(required_size <= lane_size); + + ts_stride.row = N * sizeof(uint16_t); + tdma_load_stride_bf16(ctx, &tl_tiled_B, + global_offset_bias_data + offset_N * sizeof(uint16_t), + ts_stride, // original column width + CTRL_NEURON); + } + + uint32_t ps32_mode = 0; // normal mode + uint32_t relu_enable = 0; // 1880v2 relu can be used in ps32_mode + if (tiled_K < K) { + if (offset_K == 0) { // first tile + ps32_mode = 2; // write 32b result at the first time + } else if (is_last_tile) { // last tile + ps32_mode = 1; // load previous 32-bit result + } else { + ps32_mode = 3; // init & write 32bits partial sum + } + } + + // No tiling or last tile + if ((ps32_mode == 0 || ps32_mode == 1) && do_activation && activation_method == RELU) { + relu_enable = 1; + } + + { + cvk_tiu_matrix_multiplication_param_t p; + p.res = 
&tl_tiled_Y; + p.left = &tl_tiled_L; + p.right = &tl_tiled_R; + p.bias = B_needed ? &tl_tiled_B : NULL; + p.lshift_bits = 0; // deprecated + p.rshift_bits = 0; + p.res_is_int8 = 1; // H/W constraint + p.add_result = 0; // H/W constraint + p.relu_enable = relu_enable; + p.ps32_mode = ps32_mode; + p.res_is_int8 = 1; + + p.layer_id = layer_id; + // p.sw_op_info = offset_N; + + LLVM_DEBUG(printf(" [offset_N=%d][offset_K=%d] L(%d, %d), R(%d, %d)\n", offset_N, offset_K, + p.left->shape.n, p.left->shape.col, p.right->shape.n, + p.right->shape.col);); + + matrix_multiplication(ctx, &p); + } + + // Store tiled_Y to global memory + if (is_last_tile) { + ts_stride.row = N * sizeof(uint16_t); + tdma_store_stride_bf16(ctx, &tl_tiled_Y, + global_offset_top_data + offset_N * sizeof(uint16_t), + ts_stride, // original column width + CTRL_NEURON); + } + + } // for (uint32_t offset_K = 0; offset_K < K; offset_K += tiled_K) + + } // for (uint32_t offset_N = 0; offset_N < N; offset_N += tiled_N) +} + +void cvm_fc_forward_kernel(cvk_context_t *ctx, uint32_t layer_id, gaddr_t bottom_data_gaddr, + gaddr_t weight_data_gaddr, gaddr_t bias_data_gaddr, + gaddr_t top_data_gaddr, int in_row, int in_col, int out_col, + int have_bias, int do_activation, int activation_method) { + // LLVM_DEBUG( + // printf("bf16_fc_forward_kernel\n" + // " bottom_gaddr 0x%lx, weight_gaddr 0x%lx, bias_gaddr 0x%lx, top_gaddr 0x%lx\n" + // " in (%d, %d), out (%d)\n" + // " has_bias %d, do_activation %d, activation_method %d\n", + // bottom_data_gaddr, weight_data_gaddr, bias_data_gaddr, top_data_gaddr, in_row, + // in_col, out_col, have_bias, do_activation, activation_method);); + + fc_slicing_multi_dimention(ctx, layer_id, bottom_data_gaddr, weight_data_gaddr, bias_data_gaddr, + top_data_gaddr, in_row, in_col, out_col, have_bias, do_activation, + activation_method); +} + +// gemm +inline static size_t get_neuron_csize_local(cvk_context_t *ctx, size_t h, size_t w, cvk_fmt_t fmt) { + size_t size = h * w * 
bitsize_of_fmt(fmt) / 8; + // ctx->info.eu_num neurons align + return ALIGN(size, ctx->info.eu_num); +} + +static int get_fmt_byte_sz(cvk_fmt_t fmt) { return bitsize_of_fmt(fmt) / 8; } + +static uint64_t get_slice_global_offset(uint64_t global_offset, size_t row_slice_idx, + size_t col_slice_idx, size_t row_num, size_t col_num, + size_t row_slice_num, size_t col_slice_num, cvk_fmt_t fmt) { + int fmt_byte_sz = get_fmt_byte_sz(fmt); + uint64_t slice_offset_row = 0; + if (row_slice_idx < (row_num % row_slice_num)) { + slice_offset_row = row_slice_idx * (row_num / row_slice_num + 1); + } else { + slice_offset_row = (row_num % row_slice_num) * (row_num / row_slice_num + 1) + + (row_slice_idx - (row_num % row_slice_num)) * (row_num / row_slice_num); + } + + uint64_t slice_offset_col = 0; + if (col_slice_idx < (col_num % col_slice_num)) { + slice_offset_col = col_slice_idx * (col_num / col_slice_num + 1); + } else { + slice_offset_col = (col_num % col_slice_num) * (col_num / col_slice_num + 1) + + (col_slice_idx - (col_num % col_slice_num)) * (col_num / col_slice_num); + } + + uint64_t slice_offset; + slice_offset = (slice_offset_col + slice_offset_row * col_num) * fmt_byte_sz; + return (global_offset + slice_offset); +} + +#define LOCAL_MEM_BANKS (ctx->info.lmem_banks) +#define NPU_SHIFT (get_num_shift(ctx->info.npu_num)) +#define MAX(a, b) (((a) > (b)) ? (a) : (b)) +//#define SMALL_TEST (1) +static size_t get_slice_num(cvk_context_t *ctx, size_t M, size_t N, size_t K, size_t *slice_num, + cvk_fmt_t fmt) { + int fmt_byte_sz = get_fmt_byte_sz(fmt); +#ifdef SMALL_TEST + size_t bank_size = (2048 / LOCAL_MEM_BANKS / fmt_byte_sz); +#else /* ! 
ifdef SMALL_TEST */ + size_t bank_size = (ctx->info.lmem_size / LOCAL_MEM_BANKS / fmt_byte_sz); +#endif /* SMALL_TEST */ + slice_num[0] = slice_num[1] = slice_num[2] = 1; + + size_t W_param = ctx->info.eu_num; + size_t csize_local = get_neuron_csize_local(ctx, 1, W_param, fmt); + size_t C_param = (K + W_param - 1) / W_param; + size_t size_A = M * ceiling_func_shift(C_param, NPU_SHIFT) * csize_local; + C_param = (N + W_param - 1) / W_param; + size_t size_B = K * ceiling_func_shift(C_param, NPU_SHIFT) * csize_local; + int res_byte_sz = 1; + if (fmt != CVK_FMT_BF16) { + // partial sum for 32bit output + res_byte_sz = sizeof(int); + } + + size_t size_C = res_byte_sz * M * ceiling_func_shift(C_param, NPU_SHIFT) * csize_local; + + DBG(" A size: %zu, B size: %zu, C size: %zu, bank: %zu\n", size_A, size_B, size_C, bank_size); + + if (size_A <= bank_size && size_B <= bank_size && size_C <= bank_size) { + return 0; + } else if (size_B <= bank_size) { + slice_num[0] = MAX(ceiling_func(size_A, bank_size), ceiling_func(size_C, bank_size)); + // split C local memory size + size_t slice_size = ceiling_func(M, slice_num[0]); + C_param = (N + ctx->info.eu_num - 1) / ctx->info.eu_num; + size_t slice_mem_C = slice_size * ceiling_func_shift(C_param, NPU_SHIFT) * csize_local; + if (slice_mem_C > bank_size) return 3; + return 1; + } else if (size_A <= bank_size) { + slice_num[1] = MAX(ceiling_func(size_B, bank_size), ceiling_func(size_C, bank_size)); + // Split more times in N, use load_stride maybe override previous data. + if (slice_num[1] > 1) { + int N_silce = N / slice_num[1]; + int N_floor = (N_silce / ctx->info.eu_num) * ctx->info.eu_num; + if (N_floor == 0) return 3; + slice_num[1] = ceiling_func(N, N_floor); + } + // + if (ceiling_func(ceiling_func(N, ctx->info.eu_num), NPU_SHIFT) < (int)slice_num[1]) return 3; + return 2; + } else { + return 3; + } +} + +static inline size_t get_max(size_t a, size_t b) { return a > b ? 
a : b; } + +static int load_matrix(cvk_context_t *ctx, cvk_ml_t *tlp, uint64_t gaddr_in, cvk_mg_shape_t shape, + cvk_mg_stride_t stride, cvk_fmt_t fmt) { + // load matrix data + cvk_mg_t ts_data; + ts_data.base_reg_index = 0; + ts_data.start_address = gaddr_in; + ts_data.stride = stride; + ts_data.fmt = fmt; + ts_data.shape = shape; + cvk_tdma_g2l_matrix_copy_param_t p1; + p1.src = &ts_data; + p1.dst = tlp; + if (fmt == CVK_FMT_BF16) { + ctx->ops->tdma_g2l_bf16_matrix_copy(ctx, &p1); + } else { + ctx->ops->tdma_g2l_matrix_copy(ctx, &p1); + } + + return 0; +} + +static int store_matrix(cvk_context_t *ctx, cvk_ml_t *tlp, uint64_t gaddr_in, cvk_mg_shape_t shape, + cvk_mg_stride_t stride, cvk_fmt_t fmt) { + cvk_mg_t ts_data; + ts_data.base_reg_index = 0; + ts_data.start_address = gaddr_in; + ts_data.fmt = fmt; + ts_data.shape = shape; + ts_data.stride = stride; + cvk_tdma_l2g_matrix_copy_param_t p3; + p3.src = tlp; + p3.dst = &ts_data; + if (fmt == CVK_FMT_BF16) { + ctx->ops->tdma_l2g_bf16_matrix_copy(ctx, &p3); + } else { + ctx->ops->tdma_l2g_matrix_copy(ctx, &p3); + } + + return 0; +} + +static int layerid = 0; +static void _matrix_multiplication(cvk_context_t *ctx, cvk_ml_t *tlp_a, cvk_ml_t *tlp_b, + cvk_ml_t *tlp_c, int ps32_mode) { + // mac + cvk_tiu_matrix_multiplication_param_t p2; + p2.bias = NULL; + p2.left = tlp_a; + p2.right = tlp_b; + p2.res = tlp_c; + p2.lshift_bits = 0; + p2.rshift_bits = 0; + if (tlp_c->fmt == CVK_FMT_BF16) { + p2.res_is_int8 = true; + } else { + // for int + p2.res_is_int8 = false; + } + p2.relu_enable = 0; + p2.add_result = 0; /*bf16 HW does not support add_result*/ + p2.ps32_mode = ps32_mode; + + p2.layer_id = layerid; + layerid++; + ctx->ops->tiu_matrix_multiplication(ctx, &p2); +} + +static void strategy_no_slice(cvk_context_t *ctx, size_t M, size_t N, size_t K, uint64_t gaddr_a, + uint64_t gaddr_b, uint64_t gaddr_c, cvk_fmt_t fmt) { + cvk_ml_t *tlp_a; + cvk_ml_t *tlp_b; + cvk_ml_t *tlp_c; + // size_t bank_size = ctx->info.lmem_size 
/ LOCAL_MEM_BANKS * 2; + int fmt_byte_sz = get_fmt_byte_sz(fmt); + // size_t bank_size = ctx->info.lmem_size / LOCAL_MEM_BANKS; + int psmode = 0; // default for bf16 + + cvk_ml_shape_t shape_a = ctx->ops->ml_default_shape(ctx, M, K, fmt); + cvk_ml_shape_t shape_b = ctx->ops->ml_default_shape(ctx, K, N, fmt); + cvk_ml_shape_t shape_c = ctx->ops->ml_default_shape(ctx, M, N, fmt); + + tlp_a = ctx->ops->lmem_alloc_matrix(ctx, shape_a, fmt, CTRL_AL); + tlp_b = ctx->ops->lmem_alloc_matrix(ctx, shape_b, fmt, CTRL_AL); + if (fmt == CVK_FMT_BF16) { + tlp_c = ctx->ops->lmem_alloc_matrix(ctx, shape_c, fmt, CTRL_AL); + } else { + shape_c.n = shape_a.n; + shape_c.c = shape_b.c; + shape_c.w = shape_b.w; + shape_c.col = shape_b.col; + + tlp_c = ctx->ops->lmem_alloc_ps32_matrix(ctx, shape_c, fmt, CTRL_AL); + psmode = 2; + } + + cvk_mg_shape_t shape; + shape.row = tlp_a->shape.n; + shape.col = tlp_a->shape.col; + cvk_mg_stride_t stride; + stride.row = (uint32_t)K * fmt_byte_sz; + load_matrix(ctx, tlp_a, gaddr_a, shape, stride, fmt); + + shape.row = tlp_b->shape.n; + shape.col = tlp_b->shape.col; + stride.row = (uint32_t)N * fmt_byte_sz; + load_matrix(ctx, tlp_b, gaddr_b, shape, stride, fmt); + + // mac + _matrix_multiplication(ctx, tlp_a, tlp_b, tlp_c, psmode); + + if (fmt != CVK_FMT_BF16) { + tlp_c->shape.n *= sizeof(int); // partial sum for 32bit output + } + shape.row = tlp_c->shape.n; + shape.col = tlp_c->shape.col; + stride.row = (uint32_t)N * fmt_byte_sz; + store_matrix(ctx, tlp_c, gaddr_c, shape, stride, fmt); + + ctx->ops->lmem_free_matrix(ctx, tlp_c); + ctx->ops->lmem_free_matrix(ctx, tlp_b); + ctx->ops->lmem_free_matrix(ctx, tlp_a); +} + +static void strategy_slice_on_M(cvk_context_t *ctx, size_t M, size_t N, size_t K, uint64_t gaddr_a, + uint64_t gaddr_b, uint64_t gaddr_c, size_t slice_num, + cvk_fmt_t fmt) { + cvk_ml_t *tlp_b; + int fmt_byte_sz = get_fmt_byte_sz(fmt); + + cvk_ml_shape_t s_B = ctx->ops->ml_default_shape(ctx, K, N, fmt); + + tlp_b = 
ctx->ops->lmem_alloc_matrix(ctx, s_B, fmt, CTRL_AL); + + cvk_mg_shape_t shape; + shape.row = tlp_b->shape.n; + shape.col = tlp_b->shape.col; + cvk_mg_stride_t stride; + stride.row = (uint32_t)N * fmt_byte_sz; + load_matrix(ctx, tlp_b, gaddr_b, shape, stride, fmt); + + int pack_shift = 0; + int psmode = 0; // default for bf16 + for (size_t slice_idx = 0; slice_idx < slice_num; slice_idx++) { + cvk_ml_t *tlp_a; + size_t M_slice = M / slice_num + (slice_idx < M % slice_num); + cvk_ml_shape_t s_A = ctx->ops->ml_default_shape(ctx, M_slice, K, fmt); + tlp_a = ctx->ops->lmem_alloc_matrix(ctx, s_A, fmt, CTRL_AL); + + uint64_t A_slice_global_offset = + get_slice_global_offset(gaddr_a, slice_idx, 0, M, K, slice_num, 1, fmt); + + cvk_mg_shape_t shape; + shape.row = tlp_a->shape.n; + shape.col = tlp_a->shape.col; + cvk_mg_stride_t st_A; + st_A.row = (uint32_t)K * fmt_byte_sz; + load_matrix(ctx, tlp_a, A_slice_global_offset, shape, st_A, fmt); + + tlp_a->shape.n = M_slice; + tlp_a->shape.col = K; + + cvk_ml_shape_t s_C = ctx->ops->ml_default_shape(ctx, M_slice, N, fmt); + cvk_ml_t *tlp_c; + + if (fmt == CVK_FMT_BF16) { + tlp_c = ctx->ops->lmem_alloc_matrix(ctx, s_C, fmt, CTRL_AL); + } else { + s_C.n = tlp_a->shape.n; + s_C.c = tlp_b->shape.c; + s_C.w = tlp_b->shape.w; + s_C.col = tlp_b->shape.col; + + tlp_c = ctx->ops->lmem_alloc_ps32_matrix(ctx, s_C, fmt, CTRL_AL); + psmode = 2; + } + + uint64_t C_slice_global_offset = + get_slice_global_offset(gaddr_c, slice_idx, 0, M, N, slice_num, 1, fmt); + + if (fmt != CVK_FMT_BF16) { + // int32 pack + C_slice_global_offset = get_slice_global_offset(gaddr_c, 0, 0, M, N, slice_num, 1, fmt); + C_slice_global_offset += pack_shift; + pack_shift += (tlp_c->shape.n * tlp_c->shape.col * sizeof(int)); + // C_slice_global_offset = + // get_slice_global_offset(gaddr_c, slice_idx, 0, M * 2, N * 2, slice_num, 1, fmt); + } + + _matrix_multiplication(ctx, tlp_a, tlp_b, tlp_c, psmode); + + if (fmt != CVK_FMT_BF16) { + tlp_c->shape.n *= sizeof(int); // 
partial sum for 32bit output + } + shape.row = tlp_c->shape.n; + shape.col = tlp_c->shape.col; + stride.row = (uint32_t)N * fmt_byte_sz; // place with no tiling + store_matrix(ctx, tlp_c, C_slice_global_offset, shape, stride, fmt); + + ctx->ops->lmem_free_matrix(ctx, tlp_c); + ctx->ops->lmem_free_matrix(ctx, tlp_a); + } + ctx->ops->lmem_free_matrix(ctx, tlp_b); +} + +static void strategy_slice_on_N(cvk_context_t *ctx, size_t M, size_t N, size_t K, uint64_t gaddr_a, + uint64_t gaddr_b, uint64_t gaddr_c, size_t slice_num, + cvk_fmt_t fmt) { + cvk_ml_t *tlp_a; + int fmt_byte_sz = get_fmt_byte_sz(fmt); + + cvk_ml_shape_t s_a = ctx->ops->ml_default_shape(ctx, M, K, fmt); + tlp_a = ctx->ops->lmem_alloc_matrix(ctx, s_a, fmt, CTRL_AL); + + cvk_mg_stride_t stride; + stride.row = (uint32_t)K * fmt_byte_sz; + cvk_mg_shape_t shape; + shape.row = tlp_a->shape.n; + shape.col = tlp_a->shape.col; + load_matrix(ctx, tlp_a, gaddr_a, shape, stride, fmt); + + int pack_shift = 0; + int psmode = 0; // default for bf16 + for (size_t slice_idx = 0; slice_idx < slice_num; slice_idx++) { + size_t N_slice = N / slice_num + (slice_idx < N % slice_num); + + cvk_ml_shape_t s_b = ctx->ops->ml_default_shape(ctx, K, N_slice, fmt); + cvk_ml_t *tlp_b; + tlp_b = ctx->ops->lmem_alloc_matrix(ctx, s_b, fmt, CTRL_AL); + + uint64_t B_slice_global_offset = + get_slice_global_offset(gaddr_b, 0, slice_idx, K, N, 1, slice_num, fmt); + + // load b + stride.row = (uint32_t)N * fmt_byte_sz; + shape.row = tlp_b->shape.n; + shape.col = tlp_b->shape.col; + load_matrix(ctx, tlp_b, B_slice_global_offset, shape, stride, fmt); + + // c for answer + cvk_ml_shape_t s_c = ctx->ops->ml_default_shape(ctx, M, N_slice, fmt); + cvk_ml_t *tlp_c; + if (fmt == CVK_FMT_BF16) { + tlp_c = ctx->ops->lmem_alloc_matrix(ctx, s_c, fmt, CTRL_AL); + } else { + s_c.n = tlp_a->shape.n; + s_c.c = tlp_b->shape.c; + s_c.w = tlp_b->shape.w; + s_c.col = tlp_b->shape.col; + + tlp_c = ctx->ops->lmem_alloc_ps32_matrix(ctx, s_c, fmt, CTRL_AL); + 
psmode = 2; + } + + uint64_t C_slice_global_offset = + get_slice_global_offset(gaddr_c, 0, slice_idx, M, N, 1, slice_num, fmt); + if (fmt != CVK_FMT_BF16) { + // int32 pack + C_slice_global_offset = get_slice_global_offset(gaddr_c, 0, 0, M, N, 1, slice_num, fmt); + C_slice_global_offset += pack_shift; + pack_shift += tlp_c->shape.col; + // C_slice_global_offset = + // get_slice_global_offset(gaddr_c, 0, slice_idx, M*2, N*2, 1, slice_num, fmt); + } + + _matrix_multiplication(ctx, tlp_a, tlp_b, tlp_c, psmode); + + if (fmt != CVK_FMT_BF16) { + tlp_c->shape.n *= sizeof(int); // partial sum for 32bit output + } + shape.row = tlp_c->shape.n; + shape.col = tlp_c->shape.col; + stride.row = (uint32_t)N * fmt_byte_sz; + store_matrix(ctx, tlp_c, C_slice_global_offset, shape, stride, fmt); + + ctx->ops->lmem_free_matrix(ctx, tlp_c); + ctx->ops->lmem_free_matrix(ctx, tlp_b); + } + + ctx->ops->lmem_free_matrix(ctx, tlp_a); +} + +static void slice_split_strategy(cvk_context_t *ctx, size_t M, size_t N, size_t K, + size_t *slice_num, cvk_fmt_t fmt) { + int fmt_byte_sz = get_fmt_byte_sz(fmt); + size_t W_param = ctx->info.eu_num; + size_t channel_size_local = get_neuron_csize_local(ctx, 1, W_param, fmt); +#ifdef SMALL_TEST + size_t bank_size = (2048 / LOCAL_MEM_BANKS / fmt_byte_sz); +#else + size_t bank_size = (ctx->info.lmem_size / LOCAL_MEM_BANKS / fmt_byte_sz); +#endif + size_t bank_size_half = bank_size >> 1; + slice_num[0] = slice_num[1] = slice_num[2] = 1; + + // input blob + size_t C_param = (K + W_param - 1) / W_param; + size_t local_size_A = M * (ceiling_func_shift(C_param, NPU_SHIFT)) * channel_size_local; + size_t slice_num_A = (local_size_A + bank_size_half - 1) / (bank_size_half); + size_t col_slice_time_A = ceiling_func_shift(C_param, NPU_SHIFT); + size_t row_slice_time_A = (slice_num_A < M) ? 
slice_num_A : M; + + // weight blob + C_param = (N + W_param - 1) / W_param; + size_t local_size_B = K * (ceiling_func_shift(C_param, NPU_SHIFT)) * channel_size_local; + size_t slice_num_B = (local_size_B + bank_size_half - 1) / bank_size_half; + size_t row_slice_time_B = (slice_num_B < K) ? slice_num_B : K; + + // output blob + C_param = (N + W_param - 1) / W_param; + // multi 2 for simulating result add + int outputs_nr = 2; + if (fmt != CVK_FMT_BF16) { + // int8 output 32bit result + // outputs_nr = 4; + } + size_t local_size_C = (M + 1) * (ceiling_func_shift(C_param, NPU_SHIFT)) * channel_size_local; + size_t slice_num_C = (local_size_C + bank_size * outputs_nr - 1) / (bank_size * outputs_nr); + size_t col_slice_time_C = ceiling_func_shift(C_param, NPU_SHIFT); + + // A + if (col_slice_time_A == 0) { + slice_num[0] = row_slice_time_A; + } else { + if (col_slice_time_A < slice_num_A) { + slice_num[0] = (slice_num_A + col_slice_time_A - 1) / col_slice_time_A; + } else { + slice_num[0] = 1; + slice_num[2] = slice_num_A; + } + } + + // C + if ((slice_num_C > slice_num[0]) && col_slice_time_C) { + size_t tmp = (slice_num_C + slice_num[0] - 1) / slice_num[0]; + slice_num[1] = (col_slice_time_C > tmp) ? tmp : col_slice_time_C; + } + + // B + if (slice_num_B > slice_num[1]) { + size_t tmp = (slice_num_B + slice_num[1] - 1) / slice_num[1]; + slice_num[2] = get_max(slice_num[2], (row_slice_time_B > tmp) ? 
tmp : row_slice_time_B); + } + // fine-tuning + size_t matrix_shape[3] = {1, 1, 1}; + while (true) { + matrix_shape[0] = (M + slice_num[0] - 1) / slice_num[0]; + matrix_shape[2] = (N + slice_num[1] - 1) / slice_num[1]; + matrix_shape[1] = (K + slice_num[2] - 1) / slice_num[2]; + size_t C_param_input_col = (matrix_shape[1] + W_param - 1) / W_param; + size_t C_param_weight_col = (matrix_shape[2] + W_param - 1) / W_param; + + size_t local_size_B = + matrix_shape[1] * (ceiling_func_shift(C_param_weight_col, NPU_SHIFT)) * channel_size_local; + size_t local_size_C = + matrix_shape[0] * (ceiling_func_shift(C_param_weight_col, NPU_SHIFT)) * channel_size_local; + size_t local_size_A = + matrix_shape[0] * (ceiling_func_shift(C_param_input_col, NPU_SHIFT)) * channel_size_local; + bool slicing_success = (local_size_A <= bank_size_half) && + (local_size_C <= bank_size * outputs_nr) && + (local_size_B <= bank_size_half); + + if (slicing_success) { + if (slice_num[1] > 1) { + int N_silce = N / slice_num[1]; + int N_floor = (N_silce / ctx->info.eu_num) * ctx->info.eu_num; + ASSERT(N_floor); + slice_num[1] = ceiling_func(N, N_floor); + } +#if 0 // def DEBUG_LOCAL + size_t bias_local_size = + (ceiling_func_shift(C_param_weight_col, NPU_SHIFT)) * channel_size_local; + // DBG("multi-dim slicing:\n"); + DBG("local_size_B = %lu\n", local_size_B); + DBG("local_size_C = %lu\n", local_size_C); + DBG("local_size_A = %lu\n", local_size_A); + DBG("bias_local_size = %lu\n", bias_local_size); +#endif + return; + } else if (local_size_A > bank_size_half) { + slice_num[2]++; + } else if (local_size_B > bank_size_half) { + slice_num[2]++; + } else if (local_size_C > 2 * bank_size) { + slice_num[0]++; + } + } +} + +static void strategy_slice_on_multidim_init(cvk_context_t *ctx, gaddr_t *slice_global_offset, + size_t *matrix_shape, size_t *slice_row_stride, + cvk_fmt_t fmt) { + int fmt_byte_sz = get_fmt_byte_sz(fmt); + gaddr_t global_offset_A = slice_global_offset[0]; + gaddr_t global_offset_B = 
slice_global_offset[1]; + size_t row_num_A = matrix_shape[0]; + size_t col_num_A = matrix_shape[1]; + size_t col_num_B = matrix_shape[2]; + + cvk_ml_shape_t s_A, s_B; + s_A = ctx->ops->ml_default_shape(ctx, row_num_A, col_num_A, fmt); + s_B = ctx->ops->ml_default_shape(ctx, col_num_A, col_num_B, fmt); + + cvk_mg_stride_t st_A, st_B; + st_A.row = (uint32_t)slice_row_stride[0] * fmt_byte_sz; + st_B.row = (uint32_t)slice_row_stride[1] * fmt_byte_sz; + +#ifdef SMALL_TEST + size_t bank_size = 2048 / LOCAL_MEM_BANKS; +#else + size_t bank_size = ctx->info.lmem_size / LOCAL_MEM_BANKS * 2; +#endif + cvk_ml_t *tl_A = bmk1880v2_matrix_lmem_prealloc_align(ctx, &matrix_lmem[0], 0, s_A, fmt, CTRL_AL); + cvk_ml_t *tl_B = + bmk1880v2_matrix_lmem_prealloc_align(ctx, &matrix_lmem[1], bank_size, s_B, fmt, CTRL_AL); + cvk_mg_shape_t shape; + shape.row = tl_A->shape.n; + shape.col = tl_A->shape.col; + load_matrix(ctx, tl_A, global_offset_A, shape, st_A, fmt); + + shape.row = tl_B->shape.n; + shape.col = tl_B->shape.col; + load_matrix(ctx, tl_B, global_offset_B, shape, st_B, fmt); + // DBG("0->load from %u/%u,off %lu/%lu\n", tl_A->start_address, tl_B->start_address, + // global_offset_A, + // global_offset_B); + + bool is_alloc_from_stack = true; + bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_B); + bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_A); +} + +static int strategy_slice_on_multi_dimension_internal( + cvk_context_t *ctx, size_t *slice_idx, size_t *slice_num, gaddr_t *slice_global_offset, + size_t *matrix_shape, gaddr_t *slice_global_offset_next, size_t *matrix_shape_next, + size_t *slice_row_stride, cvk_fmt_t fmt) { + int fmt_byte_sz = get_fmt_byte_sz(fmt); + size_t row_num_A = matrix_shape[0]; + size_t col_num_A = matrix_shape[1]; + size_t col_num_B = matrix_shape[2]; + + gaddr_t global_offset_next_A = slice_global_offset_next[0]; + gaddr_t global_offset_next_B = slice_global_offset_next[1]; + + size_t row_num_next_A = 
matrix_shape_next[0]; + size_t col_num_next_A = matrix_shape_next[1]; + size_t col_num_next_B = matrix_shape_next[2]; + + cvk_ml_shape_t s_next_A = ctx->ops->ml_default_shape(ctx, row_num_next_A, col_num_next_A, fmt); + cvk_ml_shape_t s_next_B = ctx->ops->ml_default_shape(ctx, col_num_next_A, col_num_next_B, fmt); + cvk_ml_shape_t s_A, s_B, s_C; + + s_A = ctx->ops->ml_default_shape(ctx, row_num_A, col_num_A, fmt); + s_B = ctx->ops->ml_default_shape(ctx, col_num_A, col_num_B, fmt); + s_C = ctx->ops->ml_default_shape(ctx, row_num_A, col_num_B, fmt); + + // int partition = 2; // 2 means one for A/B, another for C with double output + // if (fmt != CVK_FMT_BF16) { + // partition = 6; + //} + // size_t bank_size = ctx->info.lmem_size / LOCAL_MEM_BANKS * 2; +#ifdef SMALL_TEST + size_t bank_size = 2048 / LOCAL_MEM_BANKS; +#else + size_t bank_size = ctx->info.lmem_size / LOCAL_MEM_BANKS * 2; +#endif + size_t hf_bsize = bank_size / 2; + size_t cur = slice_idx[1] % 2; + size_t next = (slice_idx[1] + 1) % 2; + + int output_nr = 2; // 2 means output with low part and high part + int psmode = 1; // default for bf16 + if (fmt != CVK_FMT_BF16) { + // output_nr = 4; // 4 for 32bit output with 4 * 1 byte output + psmode = 3; + } + + cvk_mg_stride_t st_A, st_C, st_B; + st_A.row = (uint32_t)slice_row_stride[0] * fmt_byte_sz; + st_C.row = (uint32_t)slice_row_stride[1] * fmt_byte_sz; + st_B.row = (uint32_t)slice_row_stride[1] * fmt_byte_sz; + + cvk_ml_t *tl_A = + bmk1880v2_matrix_lmem_prealloc_align(ctx, &matrix_lmem[0], hf_bsize * cur, s_A, fmt, CTRL_AL); + cvk_ml_t *tl_next_A = bmk1880v2_matrix_lmem_prealloc_align(ctx, &matrix_lmem[1], hf_bsize * next, + s_next_A, fmt, CTRL_AL); + cvk_ml_t *tl_B = bmk1880v2_matrix_lmem_prealloc_align( + ctx, &matrix_lmem[2], bank_size + hf_bsize * cur, s_B, fmt, CTRL_AL); + cvk_ml_t *tl_next_B = bmk1880v2_matrix_lmem_prealloc_align( + ctx, &matrix_lmem[3], bank_size + hf_bsize * next, s_next_B, fmt, CTRL_AL); + cvk_ml_t *tl_C = 
bmk1880v2_matrix_lmem_prealloc_align(ctx, &matrix_lmem[4], output_nr * bank_size, + s_C, fmt, CTRL_AL); + + ctx->ops->parallel_enable(ctx); + + if (slice_num[2] - 1 > slice_idx[1]) { + st_A.row = (uint32_t)slice_row_stride[0] * fmt_byte_sz; + st_B.row = (uint32_t)slice_row_stride[1] * fmt_byte_sz; + + cvk_mg_shape_t shape; + shape.row = tl_next_A->shape.n; + shape.col = tl_next_A->shape.col; + load_matrix(ctx, tl_next_A, global_offset_next_A, shape, st_A, fmt); + + shape.row = tl_next_B->shape.n; + shape.col = tl_next_B->shape.col; + load_matrix(ctx, tl_next_B, global_offset_next_B, shape, st_B, fmt); + + // DBG("do %u/%u ", tl_A->start_address, tl_B->start_address); +#define PS32_CTRL_RA (3) /* normal case */ +#define PS32_CTRL_NULL (2) /* first one */ + int pint32_t_status = slice_idx[1] > 0 ? PS32_CTRL_RA : PS32_CTRL_NULL; + _matrix_multiplication(ctx, tl_A, tl_B, tl_C, pint32_t_status); + // DBG(">load from %u/%u off %lu/%lu s(%d)\n", tl_next_A->start_address, + // tl_next_B->start_address, + // global_offset_next_A, global_offset_next_B, pint32_t_status); + } + + if (slice_idx[1] == slice_num[2] - 1) { + // last one + // not using ps mode 1 cuz it could saturate from 32bit to 16 bit + _matrix_multiplication(ctx, tl_A, tl_B, tl_C, psmode); + } + + ctx->ops->parallel_disable(ctx); + + if (slice_idx[1] == slice_num[2] - 1) { + // last one + cvk_mg_shape_t shape; + if (fmt != CVK_FMT_BF16) { + tl_C->shape.n *= sizeof(int); // partial sum for 32bit output + } + shape.row = tl_C->shape.n; + shape.col = tl_C->shape.col; + st_C.row = (uint32_t)slice_row_stride[1] * fmt_byte_sz; + + store_matrix(ctx, tl_C, slice_global_offset[2], shape, st_C, fmt); + // DBG("local memory a/b/c is %u/%u/%u, store to slice_global_offset[2] %lu\n", + // tl_A->start_address, tl_B->start_address, tl_C->start_address, slice_global_offset[2]); + } + + bool is_alloc_from_stack = true; + + if (cur == 0) { + bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_next_B); + 
bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_B); + bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_next_A); + bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_A); + } else { + bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_B); + bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_next_B); + bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_A); + bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_next_A); + } + + return 0; +} + +static void strategy_slice_on_multi_dimension(cvk_context_t *ctx, gaddr_t global_offset_A, + gaddr_t global_offset_B, gaddr_t global_offset_C, + size_t M, size_t N, size_t K, size_t *slice_num, + cvk_fmt_t fmt) { + size_t slice_row_stride[4] = {0, 0, 0, 0}; + slice_row_stride[0] = K; + slice_row_stride[1] = N; + + gaddr_t slice_global_offset[3] = {0, 0, 0}; + gaddr_t slice_global_offset_next[4] = {0, 0, 0, 0}; + size_t slice_idx[3] = {0, 0, 0}; + size_t matrix_shape[3] = {0, 0, 0}; + size_t matrix_shape_next[3] = {0, 0, 0}; + int slice_idx_0 = slice_idx[0]; + int slice_idx_2 = slice_idx[2]; + int pack_shift = 0; + for (slice_idx[0] = 0; slice_idx[0] < slice_num[0]; slice_idx[0]++) { + matrix_shape[0] = M / slice_num[0] + (slice_idx[0] < M % slice_num[0]); + matrix_shape_next[0] = M / slice_num[0] + (0 + slice_idx[0] < M % slice_num[0]); + for (slice_idx[2] = 0; slice_idx[2] < slice_num[1]; slice_idx[2]++) { + matrix_shape[2] = N / slice_num[1] + (slice_idx[2] < N % slice_num[1]); + matrix_shape_next[2] = N / slice_num[1] + (0 + slice_idx[1] < N % slice_num[1]); + for (slice_idx[1] = 0; slice_idx[1] < slice_num[2]; slice_idx[1]++) { + matrix_shape[1] = K / slice_num[2] + (slice_idx[1] < K % slice_num[2]); + matrix_shape_next[1] = K / slice_num[2] + (1 + slice_idx[1] < K % slice_num[2]); + slice_global_offset[0] = get_slice_global_offset( + global_offset_A, slice_idx[0], slice_idx[1], M, K, 
slice_num[0], slice_num[2], fmt); + slice_global_offset[1] = get_slice_global_offset( + global_offset_B, slice_idx[1], slice_idx[2], K, N, slice_num[2], slice_num[1], fmt); + // the low 8-bits of C + if (fmt == CVK_FMT_BF16) { + slice_global_offset[2] = get_slice_global_offset( + global_offset_C, slice_idx[0], slice_idx[2], M, N, slice_num[0], slice_num[1], fmt); + } else { + slice_global_offset[2] = get_slice_global_offset( + global_offset_C, slice_idx_0, slice_idx_2, M, N, slice_num[0], slice_num[1], fmt); + if (slice_idx[1] == slice_num[2] - 1) { + // only shift in real store + slice_global_offset[2] += pack_shift; + // FIXME: slice N, currently ONLY slice M and K + size_t row_num_A = matrix_shape[0]; + size_t col_num_B = matrix_shape[2]; + pack_shift += (row_num_A * col_num_B * sizeof(int)); + } + } + + slice_global_offset_next[0] = get_slice_global_offset( + global_offset_A, slice_idx[0], slice_idx[1] + 1, M, K, slice_num[0], slice_num[2], fmt); + slice_global_offset_next[1] = get_slice_global_offset( + global_offset_B, slice_idx[1] + 1, slice_idx[2], K, N, slice_num[2], slice_num[1], fmt); + // DBG("=>(%s)slice_global_offset[0](%lu)/slice_global_offset[1](%lu) for slice_idx[1](%lu) + // " + // "== 0\n" + // ", (%s)slice_global_offset[2](%lu) for slice_idx[1](%lu) == slice_num[2](%lu) - 1\n" + // ", (%s)(next)slice_global_offset_next[0](%lu)/slice_global_offset_next[1](%lu) for " + // "slice_num[2](%lu) - 1 > slice_idx[1](%lu)\n" + // "next ctrl:%s, store ctrl:%s\n", + // slice_idx[1] == 0 ? "en" : " ", slice_global_offset[0], slice_global_offset[1], + // slice_idx[1], slice_idx[1] == slice_num[2] - 1 ? "en" : " ", slice_global_offset[2], + // slice_idx[1], slice_num[2], slice_num[2] - 1 > slice_idx[1] ? "en" : " ", + // slice_global_offset_next[0], slice_global_offset_next[1], slice_num[2], slice_idx[1], + // (slice_idx[1] > 0) ? "CTRL_RA" : "CTRL_NULL", + // (slice_num[2] > 1) ? 
"CTRL_RA" : "CTRL_NULL"); + + if (slice_idx[1] == 0) { + strategy_slice_on_multidim_init(ctx, slice_global_offset, matrix_shape, slice_row_stride, + fmt); + } + + strategy_slice_on_multi_dimension_internal(ctx, slice_idx, slice_num, slice_global_offset, + matrix_shape, slice_global_offset_next, + matrix_shape_next, slice_row_stride, fmt); + } + } + } +} + +size_t *bmblas_gemm(cvk_context_t *ctx, size_t M, size_t N, size_t K, uint64_t gaddr_a, + uint64_t gaddr_b, uint64_t gaddr_c, cvk_fmt_t fmt) { + size_t slice_num[3] = {1, 1, 1}; + ASSERT(slice_num[0] <= M && slice_num[0] >= 1); + ASSERT(slice_num[1] <= N && slice_num[1] >= 1); + ASSERT(slice_num[2] <= K && slice_num[2] >= 1); + + size_t strategy = get_slice_num(ctx, M, N, K, slice_num, fmt); + // printf("strategy: %lu\n slice %lu %lu %lu\n", strategy, slice_num[0], slice_num[1], + // slice_num[2]); + + switch (strategy) { + case 0: { + strategy_no_slice(ctx, M, N, K, gaddr_a, gaddr_b, gaddr_c, fmt); + } break; + case 1: { + strategy_slice_on_M(ctx, M, N, K, gaddr_a, gaddr_b, gaddr_c, slice_num[0], fmt); + } break; + case 2: { + strategy_slice_on_N(ctx, M, N, K, gaddr_a, gaddr_b, gaddr_c, slice_num[1], fmt); + } break; + case 3: { + slice_split_strategy(ctx, M, N, K, slice_num, fmt); + // printf("slice all, %lu %lu %lu\n", slice_num[0], slice_num[1], slice_num[2]); + + strategy_slice_on_multi_dimension(ctx, gaddr_a, gaddr_b, gaddr_c, M, N, K, slice_num, fmt); + } + default: + break; + } + // 3 indicate M N K + int slice_num_len = 4 * sizeof(size_t); + size_t *_slice_num = (size_t *)malloc(slice_num_len); + memcpy(_slice_num, slice_num, 3 * sizeof(size_t)); + _slice_num[3] = strategy; + return _slice_num; +} + +size_t *cvm_gemm(cvk_context_t *ctx, gaddr_t bottom_data_gaddr, gaddr_t weight_data_gaddr, + gaddr_t top_data_gaddr, int in_row, int in_col, int out_col, cvk_fmt_t fmt) { + size_t *slice_num = NULL; + if (0) { + // backend impelement + cvm_fc_forward_kernel(ctx, 0, bottom_data_gaddr, weight_data_gaddr, 
GADDR_INVALID, + top_data_gaddr, in_row, in_col, out_col, 0, 0, 0); + } else { + slice_num = bmblas_gemm(ctx, in_row, out_col, in_col, bottom_data_gaddr, weight_data_gaddr, + top_data_gaddr, fmt); + } + return slice_num; +} + +int cvm_combin_gemm_i8(size_t *slice_num, uint8_t *i8_C, uint32_t *i32_C, int M, int N) { + int bstride = M * N; + int size = bstride; + + int strategy = slice_num[3]; + int chunks = slice_num[0] * slice_num[1] * slice_num[2]; + int chunk_size = M * N / chunks; + size = chunk_size; + bstride = chunk_size; + if (strategy == 0 || strategy == 2) { + // slice N + int pack_shift = 0; + int N_slice_cnt = 0; + for (int tiling = 0; tiling < chunks; tiling++) { + size_t N_slice = N / slice_num[1] + (tiling < (int)(N % slice_num[1])); + chunk_size = N_slice * M; + size = chunk_size; + bstride = M * N; + for (int m = 0; m < (int)M; m++) { + for (int n = 0; n < (int)N_slice; n++) { + int shift = N_slice_cnt + m * N + n; + i32_C[shift] = (i8_C[shift + bstride * 0]) | (i8_C[shift + bstride * 1] << 8) | + (i8_C[shift + bstride * 2] << 16) | (i8_C[shift + bstride * 3] << 24); + } + } + pack_shift += size; + N_slice_cnt += N_slice; + } + } else if (strategy == 1) { + int pack_shift = 0; + for (int tiling = 0; tiling < chunks; tiling++) { + size_t M_slice = M / slice_num[0] + (tiling < (int)(M % slice_num[0])); + chunk_size = M_slice * N; + size = chunk_size; + bstride = chunk_size; + for (int i = 0; i < size; i++) { + i32_C[pack_shift + i] = (i8_C[pack_shift * sizeof(int) + i + bstride * 0]) | + (i8_C[pack_shift * sizeof(int) + i + bstride * 1] << 8) | + (i8_C[pack_shift * sizeof(int) + i + bstride * 2] << 16) | + (i8_C[pack_shift * sizeof(int) + i + bstride * 3] << 24); + } + pack_shift += size; + } + } else if (strategy == 3) { + // tiling all, it MUST tiling M/K ONLY + // FIXME: tiling N + int pack_shift = 0; + for (int tiling = 0; tiling < (int)slice_num[0]; tiling++) { + size_t M_slice = M / slice_num[0] + (tiling < (int)(M % slice_num[0])); + int size = 
M_slice * N; + int bstride = size; + for (int i = 0; i < size; i++) { + i32_C[pack_shift + i] = (i8_C[pack_shift * sizeof(int) + i + bstride * 0]) | + (i8_C[pack_shift * sizeof(int) + i + bstride * 1] << 8) | + (i8_C[pack_shift * sizeof(int) + i + bstride * 2] << 16) | + (i8_C[pack_shift * sizeof(int) + i + bstride * 3] << 24); + } + pack_shift += size; + } + } + return 0; +} diff --git a/cvimath/src/blas_cpu.cpp b/cvimath/src/blas_cpu.cpp new file mode 100644 index 000000000..7b091116b --- /dev/null +++ b/cvimath/src/blas_cpu.cpp @@ -0,0 +1,82 @@ +#include + +#include +#ifdef __ARM_ARCH +#include +#endif + +template +void k_selection_sort_index(T *array, uint32_t *index, T *value, const uint32_t array_size, + const uint32_t k) { + for (uint32_t i = 0; i < k; i++) { + int largest = 0; + for (uint32_t j = 0; j < array_size; j++) { + if (array[j] > array[largest]) { + largest = j; + } + } + value[i] = array[largest]; + index[i] = largest; + array[largest] = 0; + } +} + +inline uint32_t dot(uint8_t *a, uint8_t *b, uint32_t data_length) { + uint32_t dot_result = 0; + for (uint32_t i = 0; i < data_length; i++) { + dot_result += ((short)a[i] * b[i]); + } + return dot_result; +} + +inline int32_t dot_i8(int8_t *a, int8_t *b, uint32_t data_length) { + int32_t dot_result = 0; + for (uint32_t i = 0; i < data_length; i++) { + dot_result += ((short)a[i] * b[i]); + } + return dot_result; +} + +void cvm_gen_precached_i8_unit_length(int8_t *precached, float *unit_precached_arr, + const uint32_t data_length, const uint32_t data_num) { + for (uint32_t i = 0; i < data_num; i++) { + int8_t *fb_offset = precached + i * data_length; + unit_precached_arr[i] = dot_i8(fb_offset, fb_offset, data_length); + unit_precached_arr[i] = sqrt(unit_precached_arr[i]); + } +} + +void cvm_gen_precached_u8_unit_length(uint8_t *precached, float *unit_precached_arr, + const uint32_t data_length, const uint32_t data_num) { + for (uint32_t i = 0; i < data_num; i++) { + uint8_t *fb_offset = precached + i * 
data_length; + unit_precached_arr[i] = dot(fb_offset, fb_offset, data_length); + unit_precached_arr[i] = sqrt(unit_precached_arr[i]); + } +} + +void cvm_cpu_i8data_ip_match(int8_t *feature, int8_t *precached, float *unit_precached_arr, + uint32_t *k_index, float *k_value, float *buffer, + const uint32_t data_length, const uint32_t data_num, + const uint32_t k) { + float unit_feature = (float)dot_i8(feature, feature, data_length); + unit_feature = sqrt(unit_feature); + for (uint32_t i = 0; i < data_num; i++) { + buffer[i] = dot_i8(feature, precached + i * data_length, data_length) / + (unit_feature * unit_precached_arr[i]); + } + k_selection_sort_index(buffer, k_index, k_value, data_num, k); +} + +void cvm_cpu_u8data_ip_match(uint8_t *feature, uint8_t *precached, float *unit_precached_arr, + uint32_t *k_index, float *k_value, float *buffer, + const uint32_t data_length, const uint32_t data_num, + const uint32_t k) { + float unit_feature = (float)dot(feature, feature, data_length); + unit_feature = sqrt(unit_feature); + for (uint32_t i = 0; i < data_num; i++) { + buffer[i] = dot(feature, precached + i * data_length, data_length) / + (unit_feature * unit_precached_arr[i]); + } + k_selection_sort_index(buffer, k_index, k_value, data_num, k); +} \ No newline at end of file diff --git a/cvimath/src/chl_quan.cpp b/cvimath/src/chl_quan.cpp new file mode 100644 index 000000000..e97c3ea12 --- /dev/null +++ b/cvimath/src/chl_quan.cpp @@ -0,0 +1,118 @@ +#include + +#include +#include +#include +#include + +void cvm_get_chl_quan(float real_multiplier, uint32_t *quantized_multiplier, int *right_shift) { + if (real_multiplier <= 0.f || real_multiplier > 1.f) { + std::cerr << "Multiplier should be bigger than 0, smaller or euqal to 1." 
<< std::endl; + *quantized_multiplier = 0; + *right_shift = 0; + return; + } else if (real_multiplier == 1.f) { + *quantized_multiplier = (uint32_t)(1ll << 31) - 1; + *right_shift = 0; + } else { + int s = 0; + // We want to bring the real multiplier into the interval [1/2, 1). + // We can do so by multiplying it by two, and recording how many times + // we multiplied by two so that we can compensate that by a right + // shift by the same amount. + while (real_multiplier < 0.5f) { + real_multiplier *= 2.0f; + s++; + } + // Now that the real multiplier is in [1/2, 1), we convert it + // into a fixed-point number. + int64_t q = static_cast(round(real_multiplier * (1ll << 31))); + assert(q <= (1ll << 31)); + // Handle the special case when the real multiplier was so close to 1 + // that its fixed-point approximation was undistinguishable from 1. + // We handle this by dividing it by two, and remembering to decrement + // the right shift amount. + if (q == (1ll << 31)) { + q /= 2; + s--; + } + assert(s >= 0); + assert(q <= (int64_t)LONG_MAX); + *quantized_multiplier = (uint32_t)q; + *right_shift = s; + } +} + +inline void cvm_pack_per_chan_cal_data(uint32_t channels, bool has_bias, int32_t *bias, + uint32_t *multiplier, int8_t *shift, uint8_t *packed_data) { + uint8_t *ptr = packed_data; + + for (uint32_t i = 0; i < channels; i++) { + if (has_bias) { + uint32_t val = (uint32_t)bias[i]; + *ptr = val & 0xff; + ptr++; + *ptr = (val >> 8) & 0xff; + ptr++; + *ptr = (val >> 16) & 0xff; + ptr++; + *ptr = (val >> 24) & 0xff; + ptr++; + } + + { + uint32_t val = multiplier[i]; + *ptr = val & 0xff; + ptr++; + *ptr = (val >> 8) & 0xff; + ptr++; + *ptr = (val >> 16) & 0xff; + ptr++; + *ptr = (val >> 24) & 0xff; + ptr++; + } + + { + uint8_t val = shift[i]; + *ptr = val; + ptr++; + } + } +} + +void cvm_fill_chl_quan_data(const uint32_t c, const uint32_t quantized_multiplier, + const int right_shift, uint8_t *cal_data, int32_t *bias_data, + bool has_bias) { + // Create tl_multiplier + 
uint32_t *multiplier_data = new uint32_t[c]; + int8_t *shift_data = new int8_t[c]; + for (unsigned int i = 0; i < c; ++i) { + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. + multiplier_data[i] = quantized_multiplier; + + // Our H/W only supports right shift + shift_data[i] = right_shift > 0 ? right_shift : 0; + +#ifdef ENABLE_DEBUG_MSG + printf(" [oc=%d] multiplier_data %d, shift_data %d\n", i, p_param->multiplier_data[i], + p_param->shift_data[i]); +#endif + } + + cvm_pack_per_chan_cal_data(c, has_bias, bias_data, multiplier_data, shift_data, cal_data); + delete[] multiplier_data; + delete[] shift_data; +} + +uint8_t *cvm_get_chl_quan_data(const uint32_t c, const uint32_t quantized_multiplier, + const int &right_shift, int32_t *bias_data, bool has_bias) { + const int per_chan_cal_data_size = + has_bias ? 
CVK_MULTIPLIER_BIAS_PACKED_DATA_SIZE : CVK_MULTIPLIER_ONLY_PACKED_DATA_SIZE; + const int cal_data_size = c * per_chan_cal_data_size; + uint8_t *cal_data = (uint8_t *)malloc(cal_data_size); + cvm_fill_chl_quan_data(c, quantized_multiplier, right_shift, cal_data, bias_data, has_bias); + return cal_data; +} diff --git a/cvimath/src/common.c b/cvimath/src/common.c new file mode 100644 index 000000000..534d489e1 --- /dev/null +++ b/cvimath/src/common.c @@ -0,0 +1,1032 @@ +/** + * \breif common wrap function for lut + */ +#include +#include "gen_lut.h" // NOLINT + +void cvm_table_shape(cvk_context_t* ctx, cvk_tl_shape_t* s) { + // MUST valid + ASSERT(s); + + uint32_t npu_num = ctx->info.npu_num; + s->n = 1; + s->c = npu_num; + s->h = cvm_table_h(); + s->w = cvm_table_w(); // hard code for hw, hw:32x8 +} + +void cvm_table_check(cvk_tl_t* IN tl_ifmap, cvk_tl_t* tbl_answer, cvk_tl_t* tbl_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16) { + // MUST valid + ASSERT(tl_ofmap_bf16); + ASSERT(tl_ifmap); + ASSERT(tbl_answer); + ASSERT(tbl_answer_mantissa); + + // shape should be same + + // TODO table channel should be great equal input + + // currently ONLY support bf16 + ASSERT(tl_ifmap->fmt == CVK_FMT_BF16); + ASSERT(tbl_answer->fmt == CVK_FMT_BF16); + ASSERT(tbl_answer_mantissa->fmt == CVK_FMT_BF16); + ASSERT(tl_ofmap_bf16->fmt == CVK_FMT_BF16); + + // table shape should fix + ASSERT(is_1880v2_tbl_shape(&tbl_answer->shape)); + ASSERT(is_1880v2_tbl_shape(&tbl_answer_mantissa->shape)); +} + +static void _bf16_table_check(cvk_tl_t* IN tl_ifmap, cvk_tl_t* IN tl_buf, cvk_tl_t* tbl_answer, + cvk_tl_t* tbl_answer_mantissa, cvk_tl_t* OUT tl_ofmap_bf16) { + // check table / input / output + cvm_table_check(tl_ifmap, tbl_answer, tbl_answer_mantissa, tl_ofmap_bf16); + + // check buf + ASSERT(tl_buf); + ASSERT(tl_buf->fmt == CVK_FMT_BF16); + + // TODO: remove ASSERT for -O2 +} + +int _cvm_lut_exp_mantissa(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* IN tl_buf, + cvk_tl_t* 
tbl_answer, cvk_tl_t* tbl_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, bool is_dirty_ifmap) { + cvk_tl_t* tmp = tl_buf; + if (is_dirty_ifmap) { + tmp = tl_ifmap; + } + + // check table / input / output + _bf16_table_check(tl_ifmap, tl_ifmap, tbl_answer, tbl_answer_mantissa, tl_ofmap_bf16); + + // issue lut cmd + cvk_tdma_l2l_tensor_copy_param_t p10; + // remove low 8 bits by int8 copy with stride + // ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + // ops->tiu_lookup_table(ctx, &p12); + + // ops->tiu_lookup_table(ctx, &p12); + + // sqrt = (2^exp) * mantissa + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ofmap_bf16; + p1.b_is_const = 0; + p1.b = tmp; + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + return 0; +} + +int cvm_lut_exp_mantissa(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* IN tl_buf, + cvk_tl_t* tbl_answer, cvk_tl_t* tbl_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16) { + return _cvm_lut_exp_mantissa(ctx, tl_ifmap, tl_buf, tbl_answer, tbl_answer_mantissa, + tl_ofmap_bf16, false); +} + +// \int8_rnd_mode 1 is rounding to 0, e.g: 1.3->1, -1.3->-1, -1.5->-2 +// 0 is rounding to nearset even, e.g: 1.3->1, -1.3->-1, -1.7->-2 +// \return convert bf16 as int8 and locate to lower part +// e.g.: 24 = 0x18 = 1.5* 2^4 = 0x41C0 +// cvm_get_tbl_idx(0x41C0,CVK_FMT_U8) = 0x0018 +void _cvm_get_tbl_idx(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t src_fmt, int int8_rnd_mode) { + ASSERT((int8_rnd_mode == 0 || int8_rnd_mode == 1) && "only support 2 mode"); + + ASSERT(tl_ifmap->fmt == CVK_FMT_BF16); + ASSERT(tl_ofmap_bf16->fmt == CVK_FMT_BF16); + + // get index + cvk_tl_shape_t tl_ofmap_A_idx_int8_shape = {tl_ofmap_bf16->shape.n, tl_ofmap_bf16->shape.c, + tl_ofmap_bf16->shape.h * tl_ofmap_bf16->shape.w, 1}; + + cvk_tl_t dst; + bmk1880v2_tensor_lmem_s_copy(&dst, tl_ofmap_bf16); + dst.start_address = tl_ofmap_bf16->start_address; + 
dst.fmt = src_fmt; + dst.shape = tl_ofmap_A_idx_int8_shape; + dst.stride = ctx->ops->tl_default_stride(ctx, dst.shape, dst.fmt, CTRL_NULL); + dst.stride.h = dst.stride.h * 2; + dst.int8_rnd_mode = int8_rnd_mode; + + cvk_tdma_l2l_tensor_copy_param_t p10; + p10.dst = &dst; + p10.src = tl_ifmap; + p10.mv_lut_base = false; // MUST init by ifself in soc + p10.mv_lut_idx = false; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); +} + +void cvm_get_uint8_t_tbl_idx(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* OUT tl_ofmap_bf16) { + _cvm_get_tbl_idx(ctx, tl_ifmap, tl_ofmap_bf16, CVK_FMT_U8, 0); +} + +/* + * \brief get bf16 decimal part, cvm_get_dec(12.3) = 12.0 + * it leverages bf16->int8 get integer and move to bf16 + * \tl_ifmap should be CVK_FMT_BF16 format / size + */ +void cvm_get_dec(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* OUT tl_ofmap_bf16) { + ASSERT(tl_ifmap->fmt == CVK_FMT_BF16); + ASSERT(tl_ofmap_bf16->fmt == CVK_FMT_BF16); + + cvk_tdma_l2l_tensor_copy_param_t p10; + cvk_tl_t dst, src; + bmk1880v2_tensor_lmem_s_copy(&src, tl_ifmap); + bmk1880v2_tensor_lmem_s_copy(&dst, tl_buf); + + dst.fmt = CVK_FMT_I8; + dst.stride = ctx->ops->tl_default_stride(ctx, dst.shape, dst.fmt, CTRL_AL); + + // bf16 -> int8 + p10.dst = &dst; + p10.src = &src; + p10.mv_lut_base = false; // MUST init by ifself in soc + p10.mv_lut_idx = false; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + + // int8 -> bf16 + // src.fmt = CVK_FMT_I8; + // cvk_tl_shape_t tl_ofmap_A_idx_int8_shape = {tl_ofmap_bf16->shape.n, tl_ofmap_bf16->shape.c, + // tl_ofmap_bf16->shape.h * tl_ofmap_bf16->shape.w, 1}; + // src.shape = tl_ofmap_A_idx_int8_shape; + // src.stride = ctx->ops->tl_default_stride(ctx, src.shape, /*eu_align*/ 1, + // src.fmt); src.stride.w = 2; + + // cvk_tl_shape_t tl_dst_reshape = {tl_ofmap_bf16->shape.n, tl_ofmap_bf16->shape.c, + // 1, tl_ofmap_bf16->shape.h * tl_ofmap_bf16->shape.w}; + + p10.dst = tl_ofmap_bf16; + p10.src = &dst; + 
ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); +} + +// \return decimal fractions / mantissa_as_idx, +// e.g: cvm_get_dec_fractions(12.341) = 0.341 +// NOTICE: we use bf16->i8, the decimal part should be -127 ~ +127 +void cvm_get_dec_fractions(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* OUT buf, + cvk_tl_t* OUT tl_ofmap_bf16) { + ASSERT(tl_ifmap->fmt == CVK_FMT_BF16); + ASSERT(tl_ofmap_bf16->fmt == CVK_FMT_BF16); + + // idx(i8) to bf16 format to sub it + cvm_get_dec(ctx, tl_ifmap, tl_ofmap_bf16, buf); + + // mantissa part -> s.b.b to get mantissa + cvk_tiu_sub_param_t p5; + p5.res_high = 0; + p5.res_low = tl_ofmap_bf16; + p5.a_high = 0; + p5.a_low = tl_ifmap; + p5.b_high = 0; + p5.b_low = buf; + p5.rshift_bits = 0; + + ctx->ops->tiu_sub(ctx, &p5); +} + +/** + * \table_shape return table shape under 1880v2 BF16 + * \return table byte size under BF16 + */ +uint64_t cvm_lut_tbl_bytesize(cvk_context_t* ctx, cvk_tl_shape_t* table_shape, cvk_fmt_t fmt) { + ASSERT(table_shape); + + int data_type_size = bytesize_of_fmt(fmt); + cvm_table_shape(ctx, table_shape); + uint64_t table_size = tl_shape_size(table_shape); + + return table_size * data_type_size; +} + +/** + * \brief f(x) = x*x + */ +int cvm_emit_square(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t fmt) { + (void)fmt; + ASSERT(tl_ifmap->fmt == CVK_FMT_BF16); + ASSERT(tl_ofmap_bf16->fmt == CVK_FMT_BF16); + + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ifmap; + p1.b_is_const = 0; + p1.b = tl_ifmap; + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + return 0; +} + +/** + * \brief f(x) = |x| + * TODO: check tl_ifmap->start_addr != tl_ofmap_bf16->start_addr + */ +int cvm_emit_abs(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t fmt) { + (void)fmt; + ASSERT(tl_ifmap->fmt == tl_ofmap_bf16->fmt && "fmt should be equal"); + + uint32_t b_const = -1; + if (tl_ifmap->fmt) { + b_const 
= convert_fp32_bf16(-1.0); + } + + // abs it, multiply -1 + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ifmap; + p1.b_is_const = 1; + p1.b_const.val = b_const; + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + // abs it, get max + cvk_tiu_max_param_t p; + p.max = tl_ofmap_bf16; + p.a = tl_ofmap_bf16; + p.b_is_const = 0; + p.b = tl_ifmap; + + ctx->ops->tiu_max(ctx, &p); + + return 0; +} + +/** + * \brief pythagoras p(x, y) = pow(x*x + y*y, 0.5) + * plz refer [here](http://www.themathpage.com/Alg/pythagorean-distance.htm) + */ +int cvm_emit_pythagoras(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_sqrt_table_answer, + cvk_tl_t* tl_sqrt_table_answer_mantissa, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t fmt) { + // y0 = x * x + cvm_emit_square(ctx, x, tl_buf, fmt); + +#if 1 + // y0 = y0 + y * y + cvk_tiu_mac_param_t p2; + p2.res_high = 0; + p2.res_low = tl_buf; + p2.res_is_int8 = 0; + p2.a = y; + p2.b_is_const = 0; + p2.b = y; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + + ctx->ops->tiu_mac(ctx, &p2); +#else + // y * y + cvm_emit_square(ctx, y, tl_buf2, fmt); + // y = x + y + { + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_buf; + p4.a_high = 0; + p4.a_low = tl_buf2; + p4.b_is_const = 0; + p4.b.high = 0; + p4.b.low = tl_buf; + p4.rshift_bits = 0; + p4.relu_enable = 0; + + ctx->ops->tiu_add(ctx, &p4); + } +#endif + + // y0 = sqrt(y0) + cvm_emit_sqrt(ctx, tl_buf, tl_buf2, tl_sqrt_table_answer, tl_sqrt_table_answer_mantissa, + tl_ofmap_bf16); + return 0; +} + +void cvm_gen_0_tbl(uint16_t* OUT table_0, cvk_tl_shape_t* table_shape) { + ASSERT(is_1880v2_tbl_shape(table_shape)); + + uint32_t half = half_h_table(); + int table_hw = cvm_table_hw(); + + table_0[0] = convert_fp32_bf16(1.0); + + for (uint32_t i = 1; i < half * 2; i++) { + table_0[i] = convert_fp32_bf16(0.0); + } + +#ifdef 
DBG + for (uint32_t i = 0; i < 2 * half; i++) { + printf("lut [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_0[i]), table_0[i]); + } +#endif /* ifdef DBG */ + + // duplicate channel #1 to #31 + // TODO: tensor copy + for (uint64_t i = 1; i < table_shape->c; i++) { + memcpy(&table_0[table_hw * i], &table_0[0], sizeof(uint16_t) * table_hw); + } +} + +/** + * \brief check which element is 0, return 1 others return 0 + * e.g: input = [0, 1, -1, 2] output [1, 0, 0, 0] + */ +int cvm_emit_0_idx(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tbl_answer, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + // check table / input / output + _bf16_table_check(tl_ifmap, tl_buf, tbl_answer, tbl_answer, tl_ofmap_bf16); + + ASSERT(fmt); + + // TODO: add fmt parameter? + // abs for \bf16_get_uint8_t_tbl_idx we use bf16->uint8_t + // cvm_emit_abs(ctx, tl_ifmap, tl_ofmap_bf16, CVK_FMT_BF16); + // TODO check if address == of address + // cvm_get_uint8_t_tbl_idx(ctx, tl_ofmap_bf16, tl_buf); + // re-scale 0.xx to x. 
+ // cvm_emit_mul_const(ctx, tl_ifmap, tl_buf, fmt, 1000); + + // we directly use mantissa as index, try to add mantissa and mul to filter 2's power + // cvm_emit_abs(ctx, tl_ifmap, tl_buf, fmt); + // cvm_emit_add_const(ctx, tl_buf, tl_buf, fmt, convert_bf16_fp32(0x7f) + 1); + // cvm_emit_mul(ctx, tl_ifmap, tl_buf, tl_buf, fmt); + + cvk_tiu_lookup_table_param_t p12; +#if 1 + // NOTICE: we use int8 mul to enlarge 2^n + cvk_tl_t src, dst; + bmk1880v2_tensor_lmem_s_copy(&src, tl_ifmap); + bmk1880v2_tensor_lmem_s_copy(&dst, tl_buf); + + src.fmt = CVK_FMT_U8; + src.shape.w = src.shape.w * 2; // real size + src.stride = ctx->ops->tl_default_stride(ctx, src.shape, src.fmt, CTRL_NULL); + dst.shape = src.shape; + dst.fmt = src.fmt; + dst.stride = src.stride; + + cvk_tiu_mul_param_t p; + p.res_high = NULL; + p.res_low = &dst; + p.a = &src; + p.b_is_const = 1; + p.b_const.val = 255; // saturate + p.b_const.is_signed = 0; + p.rshift_bits = 2; // avoid unnormal + p.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p); + + // get 2^x and 0 + p12.ofmap = tl_buf; + p12.ifmap = tl_buf; + p12.table = tbl_answer; + ctx->ops->tiu_lookup_table(ctx, &p12); + + // cvm_get_uint8_t_tbl_idx(ctx, tl_buf, tl_ofmap_bf16); + _cvm_get_tbl_idx(ctx, tl_ifmap, tl_ofmap_bf16, CVK_FMT_I8, 0); + + // get 0ops->tiu_lookup_table(ctx, &p12); + + cvm_emit_mul(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); +#else + + _cvm_get_tbl_idx(ctx, tl_ifmap, tl_ofmap_bf16, CVK_FMT_I8, 1); + p12.ofmap = tl_ofmap_bf16; + p12.ifmap = tl_ofmap_bf16; + p12.table = tbl_answer; + ctx->ops->tiu_lookup_table(ctx, &p12); +#endif + return 0; +} + +/** + * \brief max(x, const) + * e.g.: x = [1, 2, 3, 4, -1, -2], const = 1 y = [1, 2, 3, 1, 1] + */ +int cvm_emit_max_const(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t fmt, float b) { + (void)fmt; + cvk_tiu_max_param_t p; + p.max = tl_ofmap_bf16; + p.a = tl_ifmap; + p.b_is_const = 1; + p.b_const.val = convert_fp32_bf16(b); + + ctx->ops->tiu_max(ctx, 
&p); + + return 0; +} + +/** + * \brief min(x, const) + * e.g.: x = [1, 2, 3, 4, -1, -2], const = 1 y = [1, 1, 1, -1, -2] + */ +int cvm_emit_min_const(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t fmt, float b) { + (void)fmt; + cvk_tiu_min_param_t p7; + p7.min = tl_ofmap_bf16; + p7.a = tl_ifmap; + p7.b_is_const = 1; + p7.b_const.val = convert_fp32_bf16(b); + p7.b_const.is_signed = 1; + + ctx->ops->tiu_min(ctx, &p7); + + return 0; +} + +// pre process pos/neg +static int _cvm_emit_pre_pos_neg(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_pos_neg_table, cvk_tl_t* OUT tl_ofmap_bf16) { + // check table / input / output + _bf16_table_check(tl_ifmap, tl_buf, tl_pos_neg_table, tl_pos_neg_table, tl_ofmap_bf16); + + //_cvm_get_tbl_idx(ctx, tl_ifmap, tl_buf, CVK_FMT_I8); + + // seperate >=0 and < 0 + // p = table_pos_neg(x0) (>= 0 return 1, < 0 return -1) + cvk_tdma_l2l_tensor_copy_param_t p10; + p10.dst = tl_buf; + p10.src = tl_ifmap; + p10.mv_lut_base = false; // MUST init by ifself in soc + p10.mv_lut_idx = true; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + cvk_tiu_lookup_table_param_t p12; + p12.ofmap = tl_buf; + p12.ifmap = tl_buf; + p12.table = tl_pos_neg_table; + ctx->ops->tiu_lookup_table(ctx, &p12); + + return 0; +} + +/** + * \brief check elements are < 0 + * \tl_pos_neg_table plz refer \bf16_atan_pos_neg + * e.g: input = [0, 10, 6, -1, 0] output [0, 0, 0, 1, 0] + */ +int cvm_emit_neg_idx(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_pos_neg_table, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + _cvm_emit_pre_pos_neg(ctx, tl_ifmap, tl_buf, tl_pos_neg_table, tl_ofmap_bf16); + + // sub 1, [1 -1] -> [0 -2] + cvm_emit_add_const(ctx, tl_buf, tl_buf, fmt, -1.0); + + // abs, [0 -2] -> [0 2] + cvm_emit_abs(ctx, tl_buf, tl_ofmap_bf16, fmt); + + // mul 1/2 [0 2] -> [0 1] + cvm_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 0.5); + + return 
0; +} + +/** + * \brief check elements are >= 0 + * \tl_pos_neg_table plz refer \bf16_atan_pos_neg + * e.g: input = [0, 10, 6, -1, 0] output [0, 1, 1, 0, 0] + */ +int cvm_emit_pos_idx(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_pos_neg_table, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + _cvm_emit_pre_pos_neg(ctx, tl_ifmap, tl_buf, tl_pos_neg_table, tl_ofmap_bf16); + + // add 1, [1 -1] -> [2 0] + cvm_emit_add_const(ctx, tl_buf, tl_buf, fmt, 1.0); + + // mul 1/2 [2 0] -> [1 0] + cvm_emit_mul_const(ctx, tl_buf, tl_ofmap_bf16, fmt, 0.5); + + return 0; +} + +/** + * \brief invert 0/1 input + * e.g: input = [0, 1, 1, 1, 0] output [1, 0, 0, 0, 1] + */ +int _cvm_emit_0_1_revert_input(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, bool is_dirty_ifmap) { + // [-1, -1, 0, -1, 0] = sub([0 0 1 0 1], 1) + // [1, 1, 0, 1, 0] = abs([-1, -1, 0, -1, 0]) + cvk_tl_t* _tl_buf = tl_buf; + + // check buf + if (is_dirty_ifmap) { + _tl_buf = tl_ifmap; + } else { + ASSERT(tl_buf); + ASSERT(tl_buf->fmt == CVK_FMT_BF16); + } + + // sub 1, = add -1 + cvm_emit_add_const(ctx, tl_ifmap, _tl_buf, fmt, -1.0); + + // abs + cvm_emit_abs(ctx, _tl_buf, tl_ofmap_bf16, fmt); + + return 0; +} + +int cvm_emit_0_1_revert_input(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + return _cvm_emit_0_1_revert_input(ctx, tl_ifmap, tl_buf, tl_ofmap_bf16, fmt, false); +} +/** + * \brief invert 0/1 value + * e.g: input = [0, 10, 6, -1, 0] output [1, 0, 0, 0, 1] + * the step is [0, 10, 6, -1, 0] -> [0, 1, 1, 1, 0] -> [1, 0, 0, 0, 1] + */ +int cvm_emit_0_1_revert(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tbl_answer, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + // [-1, -1, 0, -1, 0] = sub([0 0 1 0 1], 1) + // [1, 1, 0, 1, 0] = abs([-1, -1, 0, -1, 0]) + + // check table / input / output + _bf16_table_check(tl_ifmap, tl_buf, 
tbl_answer, tbl_answer, tl_ofmap_bf16); + + // check which element is 0, return 1 others return 0 + cvm_emit_0_idx(ctx, tl_ifmap, tl_buf, tbl_answer, tl_ofmap_bf16, fmt); + + cvm_emit_0_1_revert_input(ctx, tl_ofmap_bf16, tl_buf, tl_ofmap_bf16, fmt); + + return 0; +} + +// \brief a(tensor) * b(tensor) +int cvm_emit_mul(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* IN tl_ifmap2, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + (void)fmt; + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ifmap; + p1.b_is_const = 0; + p1.b = tl_ifmap2; + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + return 0; +} + +// \brief a(tensor) * b(tensor) +int cvm_emit_add(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* IN tl_ifmap2, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + (void)fmt; + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_ofmap_bf16; + p4.a_high = 0; + p4.a_low = tl_ifmap; + p4.b_is_const = 0; + p4.b.high = 0; + p4.b.low = tl_ifmap2; + p4.rshift_bits = 0; + p4.relu_enable = 0; + + ctx->ops->tiu_add(ctx, &p4); + + return 0; +} + +int cvm_emit_add_const(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t fmt, float b) { + (void)fmt; + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_ofmap_bf16; + p4.a_high = 0; + p4.a_low = tl_ifmap; + p4.b_is_const = 1; + p4.b.high = 0; + p4.b_const.val = convert_fp32_bf16(b); + p4.rshift_bits = 0; + p4.relu_enable = 0; + + ctx->ops->tiu_add(ctx, &p4); + + return 0; +} + +// \brief a(tensor) * b(const) +int cvm_emit_mul_const(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t fmt, float b) { + (void)fmt; + uint32_t b_const = (int)b; + if (fmt == CVK_FMT_BF16) { + b_const = convert_fp32_bf16(b); + } + + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ifmap; + p1.b_is_const = 1; + p1.b_const.val = b_const; + p1.b_const.is_signed = 
1; + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + return 0; +} + +// \brief a(tensor) / b(const) +// NOTICE: it could dirty \y if \is_dirty_ifmap set true +int cvm_emit_x_over_y(cvk_context_t* ctx, cvk_tl_t* IN x, cvk_tl_t* IN y, cvk_tl_t* IN tl_buf, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_tl_t* tl_table_answer, + cvk_tl_t* tl_table_answer_mantissa, cvk_fmt_t fmt, bool is_dirty_ifmap) { + cvk_tl_t* tmp = tl_buf; + if (is_dirty_ifmap) { + tmp = NULL; + } + + // y = reciprocal(y) + _cvm_lut_exp_mantissa(ctx, y, tmp, tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16, + is_dirty_ifmap); + + // x / y = x * (1/y) + cvm_emit_mul(ctx, x, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + return 0; +} + +int _cvm_emit_mask(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, cvk_tl_t* tl_buf2, + cvk_tl_t* tl_buf3, cvk_tl_t* tl_pos_neg_table, cvk_tl_t* tl_0_idx_table, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, enum CVM_MASK_TYPE mask, + bool is_dirty_ifmap) { + _bf16_table_check(tl_ifmap, tl_buf, tl_pos_neg_table, tl_0_idx_table, tl_ofmap_bf16); + if (is_dirty_ifmap) { + } else { + } + + switch (mask) { + case CVM_MASK_TYPE_GT_0: + // x > 0 + { + // x >= 0 + cvm_emit_pos_idx(ctx, tl_ifmap, tl_buf, tl_pos_neg_table, tl_buf2, fmt); + + cvk_tl_t* out = tl_ofmap_bf16; + cvk_tl_t* in = tl_ofmap_bf16; + if (is_dirty_ifmap) { + // x = 0 + cvm_emit_0_idx(ctx, tl_ifmap, tl_buf, tl_0_idx_table, tl_ofmap_bf16, + fmt); // 0.003 could consider 1 + // !(x = 0) + _cvm_emit_0_1_revert_input(ctx, tl_ofmap_bf16, NULL, tl_buf, fmt, true); + in = tl_buf; + out = tl_ofmap_bf16; + } else { + // x = 0 + cvm_emit_0_idx(ctx, tl_ifmap, tl_buf, tl_0_idx_table, tl_buf3, + fmt); // 0.003 could consider 1 + // !(x = 0) + cvm_emit_0_1_revert_input(ctx, tl_buf3, tl_buf, tl_ofmap_bf16, fmt); + } + + // x > 0 = (x >= 0 && !(x = 0)) + cvm_emit_mul(ctx, in, tl_buf2, out, fmt); + } + break; + case CVM_MASK_TYPE_GE_0: + // y >= 0 + + cvm_emit_pos_idx(ctx, tl_ifmap, tl_buf, 
tl_pos_neg_table, tl_ofmap_bf16, fmt); + break; + case CVM_MASK_TYPE_EQ_0: + cvm_emit_0_idx(ctx, tl_ifmap, tl_buf, tl_0_idx_table, tl_ofmap_bf16, + fmt); // 0.003 could consider 1 + break; + case CVM_MASK_TYPE_LT_0: + // x < 0 + + // x < 0 + cvm_emit_neg_idx(ctx, tl_ifmap, tl_buf, tl_pos_neg_table, tl_ofmap_bf16, fmt); + + break; + case CVM_MASK_TYPE_LE_0: + // x < 0 + cvm_emit_neg_idx(ctx, tl_ifmap, tl_buf, tl_pos_neg_table, tl_ofmap_bf16, fmt); + + // x = 0 + cvm_emit_0_idx(ctx, tl_ifmap, tl_buf, tl_0_idx_table, tl_buf2, + fmt); // 0.003 could consider 1 + + // x <= 0 = (x < 0 || (x = 0)) + cvm_emit_add(ctx, tl_ofmap_bf16, tl_buf2, tl_ofmap_bf16, fmt); + break; + default: + ASSERT(0 && "not support yet"); + } + return 0; +} + +/** + * \brief return > 0 mask + * e.g.: [1 2 -1 0 -3 -4 ] -> [1 1 0 0 0 0] + */ +int cvm_emit_mask_gt0(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_pos_neg_table, + cvk_tl_t* tl_0_idx_table, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + return cvm_emit_mask(ctx, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_pos_neg_table, tl_0_idx_table, + tl_ofmap_bf16, fmt, CVM_MASK_TYPE_GT_0); +} + +/** + * \brief return >= 0 mask + * e.g.: [1 2 -1 0 -3 -4 ] -> [1 1 0 1 0 0] + */ +int cvm_emit_mask_ge0(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_pos_neg_table, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + return cvm_emit_mask(ctx, tl_ifmap, tl_buf, + tl_buf, // fake + tl_buf, // fake + tl_pos_neg_table, + tl_pos_neg_table, // fake + tl_ofmap_bf16, fmt, CVM_MASK_TYPE_GE_0); +} + +/** + * \brief return <= 0 mask + * e.g.: [1 2 -1 0 -3 -4 ] -> [0 0 1 1 0 0] + */ +int cvm_emit_mask_le0(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_pos_neg_table, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + return cvm_emit_mask(ctx, tl_ifmap, tl_buf, + tl_buf, // fake + tl_buf, // fake + tl_pos_neg_table, + tl_pos_neg_table, // fake + 
tl_ofmap_bf16, fmt, CVM_MASK_TYPE_LE_0); +} + +/** + * \brief return = 0 mask + * e.g.: [1 2 -1 0 -3 -4 ] -> [0 0 0 1 0 0] + */ +int cvm_emit_mask_eq0(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_0_idx_table, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + return cvm_emit_mask(ctx, tl_ifmap, tl_buf, + tl_buf, // fake + tl_buf, // fake + tl_0_idx_table, // fake + tl_0_idx_table, tl_ofmap_bf16, fmt, CVM_MASK_TYPE_EQ_0); +} + +/** + * \brief return < 0 mask + * e.g.: [1 2 -1 0 -3 -4 ] -> [0 0 1 0 1 1] + */ +int cvm_emit_mask_lt0(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_pos_neg_table, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + return cvm_emit_mask(ctx, tl_ifmap, tl_buf, + tl_buf, // fake + tl_buf, // fake + tl_pos_neg_table, + tl_pos_neg_table, // fake + tl_ofmap_bf16, fmt, CVM_MASK_TYPE_LT_0); +} + +int cvm_emit_mask(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, cvk_tl_t* tl_buf2, + cvk_tl_t* tl_buf3, cvk_tl_t* tl_pos_neg_table, cvk_tl_t* tl_0_idx_table, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, enum CVM_MASK_TYPE mask) { + return _cvm_emit_mask(ctx, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_pos_neg_table, tl_0_idx_table, + tl_ofmap_bf16, fmt, mask, false); +} + +// return x >=0 to 1, x < 0 is -1 +void cvm_emit_mask_ge0_lt0(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* index_i8, + cvk_tl_t* OUT tl_buf3, cvk_fmt_t fmt) { + cvk_tiu_mul_param_t p; + cvk_tdma_l2l_tensor_copy_param_t p1; + + // get y < 0, bf16->int8 and mul 0xff to get -128 and righshift to get 1 + cvm_emit_mul_const(ctx, y, tl_buf3, fmt, pow(2, 64)); + p1.src = tl_buf3; + p1.dst = index_i8; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + p.res_high = 0; + p.res_low = index_i8; + p.a = index_i8; + p.b_is_const = 1; + p.b_const.val = -128; + p.b_const.is_signed = 1; + p.rshift_bits = 0; + p.relu_enable = 1; + + ctx->ops->tiu_mul(ctx, &p); + + p.res_high = 0; + p.res_low = index_i8; + p.a = index_i8; + p.b_is_const = 1; 
+ p.b_const.val = 1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 1; + + ctx->ops->tiu_mul(ctx, &p); + + // get y < 0 indicate 1 + p1.src = index_i8; + p1.dst = tl_buf3; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + // merge, y >= 0 is 1, y < 0 is -1 + cvm_emit_mul_const(ctx, tl_buf3, tl_buf3, fmt, -2.0); + cvm_emit_add_const(ctx, tl_buf3, tl_buf3, fmt, 1.0); + +#if 0 + cvm_emit_mul_const(ctx, tl_buf3, tl_buf3, fmt, -1.0); + + // get y > 0 + // y * (-1) + 1 get 0/1 map, 1 indicate xy == 0 + cvm_emit_add_const(ctx, tl_buf3, tl_buf2, fmt, 1.0); + + // reduce y == 0 + if (0) + { + cvk_tiu_max_param_t p3; + cvk_tl_t index_i8; + bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, index_i8, tl_ofmap_bf16, CVK_FMT_I8); + cvm_emit_mul_const(ctx, y, tl_buf, fmt, -1); + p3.max = tl_buf; + p3.a = y; + p3.b_is_const = 0; + p3.b = tl_buf; + + ctx->ops->tiu_max(ctx, &p3); + cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, convert_bf16_fp32(0x7f00)); + //bf16_emit_mul_const(ctx, tl_buf, tl_buf, fmt, pow(2, 64)); + + p1.src = tl_buf; + p1.dst = index_i8; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + p.res_high = NULL; + p.res_low = index_i8; + p.a = index_i8; + p.b_is_const = 1; + p.b_const.val =-1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p); + + + p1.src = index_i8; + p1.dst = tl_buf3; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + //revert it + cvm_emit_mul_const(ctx, tl_buf3, tl_buf3, fmt, -1.0); + //bf16_emit_add_const(ctx, tl_buf3, tl_buf3, fmt, 1); + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + } + + cvm_emit_add(ctx, tl_buf2, tl_buf3, tl_buf3, fmt); +#endif +} + +/* + * \return -1 means others, 0 indicate 0 + */ +void cvm_emit_mask_eq_0(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* tl_buf, cvk_tl_t* index_i8, + cvk_tl_t* OUT tl_buf3, cvk_fmt_t fmt) { + cvk_tdma_l2l_tensor_copy_param_t p1; + cvk_tiu_mul_param_t p; + + cvm_emit_abs(ctx, y, tl_buf, fmt); + // cvm_emit_mul_const(ctx, y, 
tl_buf, fmt, -1); + // cvk_tiu_max_param_t p3; + // p3.max = tl_buf; + // p3.a = y; + // p3.b_is_const = 0; + // p3.b = tl_buf; + + // ctx->ops->tiu_max(ctx, &p3); + cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, convert_bf16_fp32(0x7f00)); + + p1.src = tl_buf; + p1.dst = index_i8; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + p.res_high = NULL; + p.res_low = index_i8; + p.a = index_i8; + p.b_is_const = 1; + p.b_const.val = -1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p); + + p1.src = index_i8; + p1.dst = tl_buf3; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); +} + +int cvm_bytesize_of_fmt(cvk_fmt_t fmt) { return bytesize_of_fmt(fmt); } + +// dirty itself +int cvm_reduce_hw_mul(cvk_context_t* cvk_ctx, cvk_tl_t* mp_tl_mulsum) { + cvk_tl_shape_t m_tl_mulsum_shape = mp_tl_mulsum->shape; + uint32_t total_data_size = m_tl_mulsum_shape.h * m_tl_mulsum_shape.w; + uint32_t data_size = total_data_size; + uint32_t fmt_size = cvm_bytesize_of_fmt(mp_tl_mulsum->fmt); + cvk_tiu_mul_param_t p_mul; + cvk_tl_t tl_1; + cvk_tl_t tl_2; + tl_1.fmt = mp_tl_mulsum->fmt; + tl_2.fmt = mp_tl_mulsum->fmt; + while (data_size > 1) { + uint32_t start_addr = mp_tl_mulsum->start_address; + bool add_1 = false; + if (data_size % 2 != 0) { + add_1 = true; + data_size -= 1; + start_addr += fmt_size; + } + data_size /= 2; + uint32_t w = data_size; + uint32_t h = 1; + size_t m = w / 2; + for (size_t i = 2; i < m; i++) { + if (data_size % i == 0) { + w = data_size / i; + h = i; + if (w < 4063) { + break; + } + } + } + tl_1.start_address = start_addr; + tl_2.start_address = start_addr + (h * w * fmt_size); + tl_1.shape.n = 1; + tl_1.shape.c = m_tl_mulsum_shape.c; + tl_1.shape.h = h; + tl_1.shape.w = w; + tl_1.stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_1.shape, tl_1.fmt, 1); + tl_2.shape = tl_1.shape; + tl_2.stride = tl_1.stride; + p_mul.a = &tl_1; + p_mul.b = &tl_2; + p_mul.res_low = &tl_1; + p_mul.res_high = NULL; + 
p_mul.b_is_const = 0; + p_mul.rshift_bits = 0; + p_mul.relu_enable = 0; + cvk_ctx->ops->tiu_mul(cvk_ctx, &p_mul); + if (add_1) { + data_size += 1; + } + } + return 0; +} diff --git a/cvimath/src/fp32_bf16_kernel.c b/cvimath/src/fp32_bf16_kernel.c new file mode 100644 index 000000000..38e305336 --- /dev/null +++ b/cvimath/src/fp32_bf16_kernel.c @@ -0,0 +1,138 @@ +#include +#include "gen_lut.h" + +// only fill base_reg_index/int8_rnd_mode +static void init_tgmem(cvk_tg_t* t) { + t->base_reg_index = 0; + t->int8_rnd_mode = 0; +} + +int cvm_s2s_fp32_bf16(cvk_context_t* ctx, uint64_t gaddr_fp32, cvk_tg_shape_t fp32_shape, + uint64_t gaddr_bf16, cvk_tg_shape_t cvm_shape, cvk_fmt_t fmt) { + int ret = 0; + ASSERT(fmt == CVK_FMT_BF16 && "only support CVK_FMT_BF16"); + ASSERT(fp32_shape.w % 2 == 0 && "fp32's w MUST align with 2"); + + cvk_tdma_g2g_tensor_copy_param_t p; + + cvk_tg_t src, dst; + + init_tgmem(&src); + init_tgmem(&dst); + + int fp32_w = 2; + src.fmt = fmt; + src.start_address = gaddr_fp32 + fp32_w; // copy from high part + src.shape = fp32_shape; + src.shape.h = fp32_shape.w * fp32_shape.h / fp32_w; + src.shape.w = 1; + + int fmt_sz = bytesize_of_fmt(fmt); + src.stride.n = fp32_shape.w * fp32_shape.h * fp32_shape.c * fmt_sz; + src.stride.c = fp32_shape.w * fp32_shape.h * fmt_sz; + src.stride.h = fp32_w * fmt_sz; + + dst.fmt = fmt; + dst.start_address = gaddr_bf16; + dst.shape = cvm_shape; + dst.shape.h = cvm_shape.w * cvm_shape.h / fp32_w; + dst.shape.w = 1; + dst.stride = ctx->ops->tg_default_stride(ctx, dst.shape, dst.fmt); + + p.src = &src; + p.dst = &dst; + + ctx->ops->tdma_g2g_bf16_tensor_copy(ctx, &p); + + return ret; +} + +// default implement by s->s +void cvm_bf16_fp32(cvk_context_t* cvk_ctx, cvk_tg_t* tg_bf16, cvk_tg_t* tg_fp32) { +#if 0 + // sys->local->sys implement + cvk_fmt_t fmt = tg_bf16->fmt; + cvk_tl_shape_t tl_shape; + int ctrl = CTRL_AL; // eu align + + tl_shape.n = tg_fp32->shape.n; + tl_shape.c = tg_fp32->shape.c; + tl_shape.h = 
tg_fp32->shape.h; + tl_shape.w = tg_fp32->shape.w; + + // 1. fill local memory to 0 for mantissa + cvk_tl_t *tl_ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, tg_bf16->fmt, ctrl); + cvk_tiu_mul_param_t p0; + p0.res_high = NULL; + p0.res_low = tl_ofmap; + p0.a = tl_ofmap; + p0.b_is_const = 1; + p0.b_const.val = 0; + p0.b_const.is_signed = 0; + p0.rshift_bits = 0; + p0.relu_enable = 0; + p0.layer_id = 0; + + cvk_ctx->ops->tiu_mul(cvk_ctx, &p0); + + + // pretend the same shape, reshape h, w to h * w, 1 + int fmt_bytesize = cvm_bytesize_of_fmt(tl_ofmap->fmt); + tl_ofmap->shape.w = 1; + tl_ofmap->shape.h = tg_bf16->shape.h * tg_bf16->shape.w; + tl_ofmap->stride.h = 4; + tl_ofmap->stride.c = align_up(tg_fp32->shape.w * tg_fp32->shape.h * fmt_bytesize, + cvk_ctx->info.eu_num); + tl_ofmap->stride.n = tl_ofmap->stride.c * ceiling_func(tg_fp32->shape.c, + cvk_ctx->info.npu_num); + + + // 2. load from tg with reshaped w + // FIXME: check overwrite + tl_ofmap->start_address = tl_ofmap->start_address + 2;// 2 means shift fp32 high 16 part + cvk_tdma_g2l_tensor_copy_param_t p; + p.src = tg_bf16; + p.dst = tl_ofmap; + cvk_ctx->ops->tdma_g2l_bf16_tensor_copy(cvk_ctx, &p); + + // 3. store back to tg + tl_ofmap->start_address = tl_ofmap->start_address - 2; //revert + tl_ofmap->shape = tl_shape; + tl_ofmap->stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_ofmap->shape, fmt, ctrl); + + cvk_tdma_l2g_tensor_copy_param_t p1; + p1.src = tl_ofmap; + p1.dst = tg_fp32; + cvk_ctx->ops->tdma_l2g_bf16_tensor_copy(cvk_ctx, &p1); + + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ofmap); +#else + // sys->sys implement + // 1. fill tg with low 16but part as 0 + cvk_tdma_l2g_tensor_fill_constant_param_t p0; + p0.constant = 0; + p0.dst = tg_fp32; + p0.layer_id = 0; + cvk_ctx->ops->tdma_l2g_tensor_fill_constant(cvk_ctx, &p0); + + // 2. 
sys->sys + cvk_tdma_g2g_tensor_copy_param_t p1; + cvk_tg_shape_t shape = tg_fp32->shape; // backup + cvk_tg_stride_t stride = tg_fp32->stride; + + tg_fp32->shape.w = 1; + tg_fp32->shape.h = tg_bf16->shape.h * tg_bf16->shape.w; + tg_fp32->stride.h = 4; + + tg_fp32->start_address = tg_fp32->start_address + 2; // +2 means shift from high part + p1.src = tg_bf16; + p1.dst = tg_fp32; + p1.layer_id = 0; + cvk_ctx->ops->tdma_g2g_bf16_tensor_copy(cvk_ctx, &p1); + + // restore + tg_fp32->start_address = tg_fp32->start_address - 2; + tg_fp32->shape = shape; + tg_fp32->stride = stride; +#endif +} diff --git a/cvimath/src/gen_lut.h b/cvimath/src/gen_lut.h new file mode 100644 index 000000000..8c58a0c8c --- /dev/null +++ b/cvimath/src/gen_lut.h @@ -0,0 +1,207 @@ +#ifndef GEN_LUT_1880v2_H +#define GEN_LUT_1880v2_H + +#include +#include +#include + +#define IN +#define OUT +#define ASSERT(x) assert(x) + +static inline int cvm_exp_start() { return -62; } +static inline int cvm_exp_end() { return 63; } +static inline int cvm_table_h() { return 32; } +static inline int cvm_table_w() { return 8; } +static inline int cvm_table_hw() { return cvm_table_h() * cvm_table_w(); } +static inline int half_h_table() { return cvm_table_h() * cvm_table_w() / 2; } +static inline bool is_1880v2_tbl_shape(cvk_tl_shape_t *s) { + // FIXME: h could be reduce less than 32 + assert(s->h == (uint32_t)cvm_table_h() && s->w == (uint32_t)cvm_table_w() && + "table h/w should be 32/8"); + + return s->h == (uint32_t)cvm_table_h() && s->w == (uint32_t)cvm_table_w(); +} + +// copy cvk_tl_t structure +static inline void bmk1880v2_tensor_lmem_s_copy(cvk_tl_t *dst, cvk_tl_t *src) { + dst->start_address = src->start_address; + dst->fmt = src->fmt; + dst->shape = src->shape; + dst->stride = src->stride; + dst->int8_rnd_mode = src->int8_rnd_mode; +} + +static inline void bmk1880v2_tensor_lmem_s_copy_bf16_8(cvk_context_t *ctx, cvk_tl_t *dst, + cvk_tl_t *src, cvk_fmt_t fmt) { + assert(src->fmt == CVK_FMT_BF16 && (fmt == 
CVK_FMT_I8 || fmt == CVK_FMT_U8) && + "only support bf16->i8/uint8_t, plz check fmt\n"); + + dst->start_address = src->start_address; + dst->fmt = fmt; + dst->shape = src->shape; + dst->shape.w *= 2; + dst->stride = ctx->ops->tl_default_stride(ctx, dst->shape, fmt, CTRL_NULL); + // dst->shape.h *= 2; + // dst->stride = ctx->ops->tl_default_stride(ctx, dst->shape, + // /*eu_align*/ 1, + // fmt); + // dst->shape.h = src->shape.h; + dst->int8_rnd_mode = src->int8_rnd_mode; +} + +// l2l means we keep the same shape between bf16/(u)int8 +static inline void bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(cvk_context_t *ctx, cvk_tl_t *dst, + cvk_tl_t *src, cvk_fmt_t fmt) { + assert(src->fmt == CVK_FMT_BF16 && (fmt == CVK_FMT_I8 || fmt == CVK_FMT_U8) && + "only support bf16->i8/uint8_t, plz check fmt\n"); + + dst->start_address = src->start_address; + dst->fmt = fmt; + dst->shape = src->shape; + dst->stride = ctx->ops->tl_default_stride(ctx, dst->shape, fmt, CTRL_NULL); + dst->int8_rnd_mode = src->int8_rnd_mode; +} + +int cvm_emit_square(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *OUT tl_ofmap_bf16, + cvk_fmt_t fmt); + +void cvm_table_check(cvk_tl_t *IN tl_ifmap, cvk_tl_t *tbl_answer, cvk_tl_t *tbl_answer_mantissa, + cvk_tl_t *OUT tl_ofmap_bf16); + +int cvm_lut_exp_mantissa(cvk_context_t *ctx, cvk_tl_t *IN tl_ifmap, cvk_tl_t *IN tl_buf, + cvk_tl_t *tbl_answer, cvk_tl_t *tbl_answer_mantissa, + cvk_tl_t *OUT tl_ofmap_bf16); + +void cvm_get_uint8_t_tbl_idx(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *OUT tl_ofmap_bf16); + +void cvm_get_dec(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *OUT tl_ofmap_bf16); + +void cvm_get_dec_fractions(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *OUT buf, + cvk_tl_t *OUT tl_ofmap_bf16); + +int cvm_emit_abs(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *OUT tl_ofmap_bf16, + cvk_fmt_t fmt); + +int _cvm_lut_exp_mantissa(cvk_context_t *ctx, cvk_tl_t *IN tl_ifmap, cvk_tl_t *IN tl_buf, + cvk_tl_t *tbl_answer, 
cvk_tl_t *tbl_answer_mantissa, + cvk_tl_t *OUT tl_ofmap_bf16, bool is_dirty_ifmap); + +int _cvm_atan_fast_emit(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2, + cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf, + cvk_tl_t *tl_table_answer, cvk_tl_t *tl_table_answer_mantissa, + cvk_tl_t *OUT tl_ofmap_bf16, cvk_fmt_t fmt, float b, bool is_dirty_ifmap); + +int cvm_emit_x_over_y(cvk_context_t *ctx, cvk_tl_t *IN x, cvk_tl_t *IN y, cvk_tl_t *IN tl_buf, + cvk_tl_t *OUT tl_ofmap_bf16, cvk_tl_t *tl_table_answer, + cvk_tl_t *tl_table_answer_mantissa, cvk_fmt_t fmt, bool is_dirty_ifmap); + +int _cvm_emit_mask(cvk_context_t *ctx, cvk_tl_t *IN tl_ifmap, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2, + cvk_tl_t *tl_buf3, cvk_tl_t *tl_pos_neg_table, cvk_tl_t *tl_0_idx_table, + cvk_tl_t *OUT tl_ofmap_bf16, cvk_fmt_t fmt, enum CVM_MASK_TYPE mask, + bool is_dirty_ifmap); + +void _cvm_get_tbl_idx(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *OUT tl_ofmap_bf16, + cvk_fmt_t src_fmt, int int8_rnd_mode); +int __cvm_atan_fast_emit(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_invert_buf, + cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_table_answer, + cvk_tl_t *tl_table_answer_mantissa, cvk_tl_t *OUT tl_ofmap_bf16, + cvk_fmt_t fmt); + +// not need to export to user +// mask please refer \CVM_MASK_TYPE for supported case +int cvm_emit_mask_gt0(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_pos_neg_buf, + cvk_tl_t *tl_0_idx_buf, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +int cvm_emit_mask_ge0(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_pos_neg_table, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +int cvm_emit_mask_le0(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_pos_neg_table, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +int cvm_emit_mask_eq0(cvk_context_t 
*cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_0_idx_table, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +int cvm_emit_mask_lt0(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_pos_neg_table, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +int _cvm_atan_emit(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2, + cvk_tl_t *tl_buf3, cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_slope_buf, + cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_table_answer, + cvk_tl_t *tl_table_answer_mantissa, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt, + float b); + +void cvm_emit_mask_ge0_lt0(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *index_i8, + cvk_tl_t *tl_buf3, cvk_fmt_t fmt); + +void cvm_emit_mask_eq_0(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *tl_buf, cvk_tl_t *index_i8, + cvk_tl_t *tl_buf3, cvk_fmt_t fmt); + +int cvm_lut_exp_mantissa(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tbl_answer, cvk_tl_t *tbl_answer_mantissa, + cvk_tl_t *tl_ofmap_bf16); + +int cvm_emit_pythagoras(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *x, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_sqrt_table_answer, + cvk_tl_t *tl_sqrt_table_answer_mantissa, cvk_tl_t *tl_ofmap_bf16, + cvk_fmt_t fmt); + +int cvm_emit_max_const(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ofmap_bf16, + cvk_fmt_t fmt, float b); + +int cvm_emit_min_const(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ofmap_bf16, + cvk_fmt_t fmt, float b); + +int cvm_emit_0_1_revert(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tbl_answer, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +int cvm_emit_mul(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2, + cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +int cvm_emit_add(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2, + cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +int cvm_emit_add_const(cvk_context_t *cvk_ctx, cvk_tl_t 
*tl_ifmap, cvk_tl_t *tl_ofmap_bf16, + cvk_fmt_t fmt, float b); + +int cvm_emit_mul_const(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ofmap_bf16, + cvk_fmt_t fmt, float b); +// not release yet + +void cvm_atan2_emit(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *x, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_buf4, cvk_tl_t *tl_buf5, + cvk_tl_t *tl_buf6, cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_slope_buf, + cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_table_answer, + cvk_tl_t *tl_table_answer_mantissa, cvk_tl_t *tl_sqrt_table_answer, + cvk_tl_t *tl_sqrt_table_answer_mantissa, cvk_tl_t *tl_0_idx_table, + cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +int cvm_atan_slope_multipilier(cvk_context_t *cvk_ctx, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2, + cvk_tl_t *tl_buf3, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ofmap_bf16, + cvk_fmt_t fmt); + +int cvm_atan_fast_emit(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_invert_buf, + cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_table_answer, + cvk_tl_t *tl_table_answer_mantissa, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt, + bool is_dirty_ifmap); + +void cvm_atan2_fast_emit(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *x, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_buf4, + cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_slope_buf, cvk_tl_t *tl_invert_buf, + cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_table_answer, + cvk_tl_t *tl_table_answer_mantissa, cvk_tl_t *tl_0_idx_table, + cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +// conv used +int cvm_reshape_channel_same_pad( + cvk_context_t *cvk_ctx, int ic, int ih, int iw, int kh, int kw, int pad_right, int pad_left, + int stride_h, int stride_w, cvk_tl_shape_t *tl_load_shape, cvk_tl_stride_t *new_tl_ifmap_stride, + cvk_tg_shape_t *new_tg_ifmap_shape, cvk_tg_stride_t *new_tg_ifmap_stride, + cvk_tl_shape_t *new_tl_weight_shape, cvk_tl_shape_t *new_tl_bias_shape, + cvk_tl_shape_t 
*new_tl_ofmap_shape, cvk_fmt_t fmt, int eu_align); + +#endif /* GEN_LUT_1880v2_H */ diff --git a/cvimath/src/set_val_by_mask.c b/cvimath/src/set_val_by_mask.c new file mode 100644 index 000000000..33c976a79 --- /dev/null +++ b/cvimath/src/set_val_by_mask.c @@ -0,0 +1,1169 @@ +#include +#include "gen_lut.h" + +static inline int check_u8(cvk_tl_t* a, cvk_tl_t* b, cvk_tl_t* c) { + return (a->fmt == CVK_FMT_U8 && b->fmt == CVK_FMT_U8 && c->fmt == CVK_FMT_U8); +} + +static inline int check_same_fmt(cvk_tl_t* a, cvk_tl_t* b, cvk_tl_t* c) { + return a->fmt == b->fmt && b->fmt == c->fmt; +} + +int cvm_set_image_by_u8mask(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_mask, cvk_tl_t* tl_ofmap) { + int ret = 0; + cvk_fmt_t fmt = tl_ifmap->fmt; + if (!check_u8(tl_ifmap, tl_buf, tl_ofmap)) { + // throw config error + printf("input/buf/ofmap format should config CVK_FMT_U8\n"); + return -1; + } + + if (!check_same_fmt(tl_ifmap, tl_buf, tl_mask) && !check_same_fmt(tl_ifmap, tl_buf, tl_ofmap)) { + printf("all tensor's fmt shoud be equal\n"); + return -2; + } + + cvk_tl_t* high = tl_buf; + if (tl_ofmap->fmt == CVK_FMT_BF16) { + // TODO: support it + high = NULL; + } else if (tl_ofmap->fmt == CVK_FMT_U8) { + // hw limitation that input should be i8 + tl_ofmap->fmt = tl_buf->fmt = tl_mask->fmt = CVK_FMT_I8; + cvm_emit_mul_const(ctx, high, high, high->fmt, 0); + } else { + printf("not support fmt\n"); + return -3; + } + + // revert mask to set selected one as 0 + cvm_emit_mul_const(ctx, tl_mask, tl_mask, tl_mask->fmt, -1); + + // set mask selected one as 0 + // e.g: -1 - (-1) for this cast that turn to -1 * -1 + 255(0xff) = 256, get low part as 0 + cvk_tiu_mac_param_t p2; + p2.res_high = high; + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ofmap; + p2.b_is_const = 0; + p2.b = tl_mask; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + + ctx->ops->tiu_mac(ctx, &p2); + + // revert back 
+ cvm_emit_mul_const(ctx, tl_mask, tl_mask, tl_mask->fmt, -1); + + // overwrite selected one + p2.res_high = high; + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ifmap; + p2.b_is_const = 0; + p2.b = tl_mask; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + + ctx->ops->tiu_mac(ctx, &p2); + + // restore + tl_ifmap->fmt = tl_buf->fmt = tl_mask->fmt = tl_ofmap->fmt = fmt; + + return ret; +} + +// dp means depthwise version +int cvm_set_image_by_u8mask_dp(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_mask, + cvk_tl_t* tl_kernel, cvk_tl_t* tl_bias, cvk_tl_t* tl_ofmap) { + int ret = 0; + cvk_fmt_t fmt = tl_ifmap->fmt; + if (!check_u8(tl_ifmap, tl_ifmap, tl_ofmap)) { + // throw config error + printf("input/buf/ofmap format should config CVK_FMT_U8\n"); + return -1; + } + + if (!check_same_fmt(tl_ifmap, tl_ifmap, tl_mask) && + !check_same_fmt(tl_ifmap, tl_ifmap, tl_ofmap)) { + printf("all tensor's fmt shoud be equal\n"); + return -2; + } + + if (tl_ofmap->fmt == CVK_FMT_BF16) { + // TODO: support it + } else if (tl_ofmap->fmt == CVK_FMT_U8) { + // hw limitation that input should be i8 + tl_ifmap->fmt = tl_ofmap->fmt = tl_mask->fmt = CVK_FMT_I8; + } else { + printf("not support fmt\n"); + return -3; + } + + // mask 1 means overwrite new one, 0 means keep old + // if = if * mask + // mask = depthwise(mask) * kernel_1x1 + bias, kernel set to -1, bias set to 1 + // of = of * mask + // mask = mask * 0 // reset high part + // mask_of = of + 1 * if + + cvm_emit_mul(ctx, tl_ifmap, tl_mask, tl_ifmap, tl_mask->fmt); + + // revert 0/1 to 1/0 + cvk_tiu_depthwise_pt_convolution_param_t param; + param.ofmap = tl_mask; + param.ifmap = tl_mask; + param.weight = tl_kernel; + param.bias = tl_bias; + param.ins_h = 0; + param.ins_last_h = 0; + param.ins_w = 0; + param.ins_last_w = 0; + param.stride_h = 1; + param.stride_w = 1; + param.dilation_h = 1; + param.dilation_w = 1; + param.pad_top = 0; + 
param.pad_bottom = 0; + param.pad_left = 0; + param.pad_right = 0; + param.relu_enable = 0; + param.rshift_bits = 0; + param.ins_val = 0; // symmetric quantization + param.ins_fp = 0; // symmetric quantization + ctx->ops->tiu_pt_depthwise_convolution(ctx, ¶m); + + // keep of + cvm_emit_mul(ctx, tl_ofmap, tl_mask, tl_ofmap, tl_mask->fmt); + + // reset high part + cvm_emit_mul_const(ctx, tl_mask, tl_mask, tl_mask->fmt, 0); + + cvk_tiu_mac_param_t p2; + p2.res_high = tl_mask; + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ifmap; + p2.b_is_const = 1; + p2.b_const.val = 1; + p2.b_const.is_signed = 1; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + + ctx->ops->tiu_mac(ctx, &p2); + + // restore + tl_ifmap->fmt = tl_mask->fmt = tl_ofmap->fmt = fmt; + + return ret; +} + +// \is_less = 1 means that 1 indicate less and 0 is greater equal \threshold +// \is_less = 0 means that 1 indicate greater equal \threshold and 0 indicate less +static void __get_less_large_mask(cvk_context_t* ctx, cvk_tl_t* buf, cvk_tl_t* buf2, + cvk_tl_t* tl_update_tbl, uint8_t threshold, bool is_less) { + bool is_signed = buf->fmt == CVK_FMT_I8; + + // keep tl_update_tbl < threshold + // mul to hoist int16 and add it with sign bit + // TODO: try not use high part + cvk_tiu_mul_param_t p1 = {0}; + p1.res_high = buf2; + p1.res_low = buf; + p1.a = tl_update_tbl; + p1.b_const.val = 1; + p1.b_const.is_signed = is_signed; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + // just check sign bit for > thres or not + // i16 diff + cvk_fmt_t fmt = buf->fmt; + buf2->fmt = buf->fmt = CVK_FMT_I8; + is_signed = true; + + // e.g: 10 - 6 = 4, 2 - 6 = -4 + cvk_tiu_add_param_t p4; + p4.res_high = 0; // saturatue to int8 + p4.res_low = buf; + p4.a_high = buf2; + p4.a_low = buf; + p4.b_is_const = 1; + p4.b_const.val = -1 * (threshold); + p4.b_const.is_signed = is_signed; + 
p4.rshift_bits = 0; + p4.relu_enable = 0; + ctx->ops->tiu_add(ctx, &p4); + + // saturate to int max + // 4 * -127 = -128, -4 * -127 = 127 + // hex represent is 0x80, 0x7F + cvk_tiu_mul_param_t p; + p.res_high = 0; + p.res_low = buf; + p.a = buf; + p.b_is_const = 1; + p.b_const.val = -127; // revert to > 0 + if (!is_less) { + p.b_const.val = 127; + } + p.b_const.is_signed = is_signed; + p.rshift_bits = 0; + p.relu_enable = is_signed; + ctx->ops->tiu_mul(ctx, &p); + + // set as mask(127->1) + // hex represent is 0x80, 0x7F, right shift 7 + // 0x1, 0x0 + p.res_high = 0; + p.res_low = buf; + p.a = buf; + p.b_is_const = 1; + p.b_const.val = 1; + p.b_const.is_signed = is_signed; + p.rshift_bits = 7; + p.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p); + + // revert + buf2->fmt = buf->fmt = fmt; + return; +} + +// \is_less = 1 means that 1 indicate less and 0 is greater equal \threshold +// \is_less = 0 means that 1 indicate greater equal \threshold and 0 indicate less +static void __get_less_large_mask_dp(cvk_context_t* ctx, cvk_tl_t* tl_update_tbl, + cvk_tl_t* tl_kernel, cvk_tl_t* tl_threshold, bool is_less) { + bool is_signed = tl_update_tbl->fmt == CVK_FMT_I8; + + // 1. depthwise(tl_update_tbl) * kernel(1) + bias(-1 * threshold) + // 2. mul to saturate it + // 3. 
right shift + + cvk_tiu_depthwise_pt_convolution_param_t param; + param.ofmap = tl_update_tbl; + param.ifmap = tl_update_tbl; + param.weight = tl_kernel; + param.bias = tl_threshold; + param.ins_h = 0; + param.ins_last_h = 0; + param.ins_w = 0; + param.ins_last_w = 0; + param.stride_h = 1; + param.stride_w = 1; + param.dilation_h = 1; + param.dilation_w = 1; + param.pad_top = 0; + param.pad_bottom = 0; + param.pad_left = 0; + param.pad_right = 0; + param.relu_enable = 0; + param.rshift_bits = 0; + param.ins_val = 0; // symmetric quantization + param.ins_fp = 0; // symmetric quantization + ctx->ops->tiu_pt_depthwise_convolution(ctx, ¶m); + + cvk_fmt_t fmt = tl_update_tbl->fmt; + tl_update_tbl->fmt = CVK_FMT_I8; + is_signed = true; + + // saturate to int max + // 4 * -127 = -128, -4 * -127 = 127 + // hex represent is 0x80, 0x7F + cvk_tiu_mul_param_t p; + p.res_high = 0; + p.res_low = tl_update_tbl; + p.a = tl_update_tbl; + p.b_is_const = 1; + p.b_const.val = -127; // revert to > 0 + if (!is_less) { + p.b_const.val = 127; + } + p.b_const.is_signed = is_signed; + p.rshift_bits = 0; + p.relu_enable = is_signed; + ctx->ops->tiu_mul(ctx, &p); + + // set as mask(127->1) + // hex represent is 0x80, 0x7F, right shift 7 + // 0x1, 0x0 + p.res_high = 0; + p.res_low = tl_update_tbl; + p.a = tl_update_tbl; + p.b_is_const = 1; + p.b_const.val = 1; + p.b_const.is_signed = is_signed; + p.rshift_bits = 7; + p.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p); + + // revert + tl_update_tbl->fmt = fmt; + return; +} +/** + * \high as output + */ +static void _get_less_large_mask(cvk_context_t* ctx, cvk_tl_t* buf, cvk_tl_t* buf2, + cvk_tl_t* tl_update_tbl, uint8_t threshold, bool is_less) { + bool is_signed = buf->fmt == CVK_FMT_I8; + // keep tl_update_tbl < threshold + // mul to hoist int16 and add it with sign bit + // TODO: try not use high part + cvk_tiu_mul_param_t p1 = {0}; + p1.res_high = buf2; + p1.res_low = buf; + p1.a = tl_update_tbl; + p1.b_const.val = 1; + p1.b_const.is_signed = 
is_signed; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + // just check sign bit for > thres or not + // i16 diff + cvk_fmt_t fmt = buf->fmt; + buf2->fmt = buf->fmt = CVK_FMT_I8; + is_signed = true; + + cvk_tiu_add_param_t p4; + p4.res_high = 0; // saturatue to int8 + p4.res_low = buf; + p4.a_high = buf2; + p4.a_low = buf; + p4.b_is_const = 1; + p4.b_const.val = -1 * (threshold); + p4.b_const.is_signed = is_signed; + p4.rshift_bits = 0; + p4.relu_enable = 0; + ctx->ops->tiu_add(ctx, &p4); + + // saturate to int max + cvk_tiu_mul_param_t p; + p.res_high = 0; + p.res_low = buf; + p.a = buf; + p.b_is_const = 1; + p.b_const.val = -127; // revert to > 0 + if (!is_less) { + p.b_const.val = 127; + } + p.b_const.is_signed = is_signed; + p.rshift_bits = 0; + p.relu_enable = is_signed; + ctx->ops->tiu_mul(ctx, &p); + + // set as mask(127->1) + p.res_high = 0; + p.res_low = buf; + p.a = buf; + p.b_is_const = 1; + p.b_const.val = 1; + p.b_const.is_signed = is_signed; + p.rshift_bits = 7; + p.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p); + + // revert + buf2->fmt = buf->fmt = fmt; + return; +} + +static void _get_less_mask(cvk_context_t* ctx, cvk_tl_t* buf, cvk_tl_t* buf2, + cvk_tl_t* tl_update_tbl, uint8_t threshold) { + _get_less_large_mask(ctx, buf, buf2, tl_update_tbl, threshold, /*is_less=*/1); +} + +static void _get_large_mask(cvk_context_t* ctx, cvk_tl_t* buf, cvk_tl_t* buf2, + cvk_tl_t* tl_update_tbl, uint8_t threshold) { + _get_less_large_mask(ctx, buf, buf2, tl_update_tbl, threshold, /*is_less=*/0); +} + +int cvm_set_image_by_two_info_i8(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf2, + cvk_tl_t* tl_mask, cvk_tl_t* tl_update_tbl, uint8_t threshold, + cvk_tl_t* tl_ofmap) { + cvk_fmt_t fmt = tl_ifmap->fmt; + if (!check_u8(tl_ifmap, tl_buf2, tl_update_tbl)) { + // throw config error + printf("input/buf/tl_update_tbl format should config CVK_FMT_U8\n"); + return -1; + } + + if 
(!check_same_fmt(tl_ifmap, tl_buf2, tl_mask)) { + printf("all tensor's fmt shoud be equal\n"); + return -2; + } + + if (tl_ofmap->fmt == CVK_FMT_BF16) { + } else if (tl_ofmap->fmt == CVK_FMT_U8) { + // hw limitation that input should be i8 + // tl_update_tbl->fmt = tl_buf->fmt = tl_mask->fmt = tl_ofmap->fmt = CVK_FMT_I8; + tl_buf2->fmt = tl_update_tbl->fmt = tl_ofmap->fmt = tl_mask->fmt = CVK_FMT_I8; + } else { + printf("not support fmt\n"); + return -3; + } + + __get_less_large_mask(ctx, tl_update_tbl, tl_buf2, tl_update_tbl, threshold, 1); + + // set new mask + cvm_emit_mul(ctx, tl_mask, tl_update_tbl, tl_mask, tl_mask->fmt); + + // restore + tl_buf2->fmt = tl_update_tbl->fmt = tl_mask->fmt = tl_ofmap->fmt = fmt; + + return cvm_set_image_by_u8mask(ctx, tl_ifmap, tl_buf2, tl_mask, tl_ofmap); +} + +int cvm_set_image_by_two_info_i8_dp(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_kernel, + cvk_tl_t* tl_mask, cvk_tl_t* tl_update_tbl, + cvk_tl_t* tl_threshold, cvk_tl_t* tl_ofmap) { + cvk_fmt_t fmt = tl_ifmap->fmt; + if (!check_u8(tl_ifmap, tl_ofmap, tl_update_tbl)) { + // throw config error + printf("input/buf/tl_update_tbl format should config CVK_FMT_U8\n"); + return -1; + } + + if (!check_same_fmt(tl_ifmap, tl_ofmap, tl_mask)) { + printf("all tensor's fmt shoud be equal\n"); + return -2; + } + + if (tl_update_tbl->shape.h <= 1) { + printf("tl_update_tbl will be as bias high part, the high should be >= 2\n"); + return -3; + } + + if (tl_ofmap->fmt == CVK_FMT_BF16) { + } else if (tl_ofmap->fmt == CVK_FMT_U8) { + // hw limitation that input should be i8 + // tl_update_tbl->fmt = tl_buf->fmt = tl_mask->fmt = tl_ofmap->fmt = CVK_FMT_I8; + tl_update_tbl->fmt = tl_mask->fmt = CVK_FMT_I8; + } else { + printf("not support fmt\n"); + return -3; + } + + __get_less_large_mask_dp(ctx, tl_update_tbl, tl_kernel, tl_threshold, 1); + + // set new mask + cvm_emit_mul(ctx, tl_mask, tl_update_tbl, tl_mask, tl_mask->fmt); + + // dirty bias set to 1 + // tl_bias = tl_bias * 0 + 
// tl_update_tbl = tl_update_tbl * 0 + // tl_update-tbl_tl_bias = tl_update_tbl-tl_bias + 1, reshape tl_update_tbl, set to 1 + // tbl_tl_bias = tl_update copy high part to tbl_tl_bias high part, stride w = 2 + cvm_emit_mul_const(ctx, tl_threshold, tl_threshold, tl_threshold->fmt, 0); + cvm_emit_mul_const(ctx, tl_update_tbl, tl_update_tbl, tl_update_tbl->fmt, 0); + + cvk_tl_stride_t tl_update_tbl_st = tl_update_tbl->stride; + tl_update_tbl->stride = tl_threshold->stride; + tl_update_tbl->shape = tl_threshold->shape; + + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_threshold; + p4.a_high = tl_update_tbl; + p4.a_low = tl_threshold; + p4.b_is_const = 1; + p4.b_const.val = 1; + p4.b_const.is_signed = 1; + p4.rshift_bits = 0; + p4.relu_enable = 0; + ctx->ops->tiu_add(ctx, &p4); + + // clean high part + tl_threshold->start_address++; // continuous low/high + tl_update_tbl->shape.n = 1; + cvm_emit_mul_const(ctx, tl_threshold, tl_threshold, tl_threshold->fmt, 0); + tl_threshold->start_address--; // restore + + // restore + tl_update_tbl->fmt = tl_mask->fmt = fmt; + tl_update_tbl->stride = tl_update_tbl_st; + tl_update_tbl->shape = tl_mask->shape; + + // set to -1 for \cvm_set_image_by_u8mask_dp + cvm_emit_mul_const(ctx, tl_kernel, tl_kernel, tl_kernel->fmt, -1); + + return cvm_set_image_by_u8mask_dp(ctx, tl_ifmap, tl_mask, tl_kernel, tl_threshold, tl_ofmap); +} + +int cvm_gen_image_diff(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_ifmap2, + cvk_tl_t* tl_buf, cvk_tl_t* tl_buf2, cvk_tl_t* tl_ofmap) { + int ret = 0; + cvk_fmt_t fmt = tl_ifmap->fmt; + if (!check_u8(tl_ifmap, tl_ifmap2, tl_ofmap)) { + // throw config error + printf("input/buf/ofmap format should config CVK_FMT_U8\n"); + return -1; + } + + if (!check_same_fmt(tl_ifmap, tl_buf, tl_buf2)) { + printf("all tensor's fmt shoud be equal\n"); + return -2; + } + + if (tl_ofmap->fmt == CVK_FMT_BF16) { + // TODO: support it + } else if (tl_ofmap->fmt == CVK_FMT_U8) { + } else { + printf("not support 
fmt\n"); + return -3; + } + + // get large one + cvk_tiu_max_param_t p13 = {0}; + p13.max = tl_buf; + p13.a = tl_ifmap; + p13.b_is_const = 0; + p13.b = tl_ifmap2; + p13.layer_id = 0; + ctx->ops->tiu_max(ctx, &p13); + + // compare to get a > b or a < b, 1 means a > b + // cvk_tiu_sub_param_t p5; + // p5.res_high = 0; // saturatue to int8 + // p5.res_low = tl_ofmap; + // p5.a_high= tl_buf2; + // p5.a_low = tl_buf; + // p5.b_high = tl_buf2; + // p5.b_low = tl_ifmap2; + // p5.rshift_bits = 0; + // ctx->ops->tiu_sub(ctx, &p5); + tl_ifmap2->fmt = tl_buf->fmt = tl_buf2->fmt = CVK_FMT_I8; + cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, tl_buf2->fmt, 0); + cvk_tiu_mac_param_t p2; + p2.res_high = tl_buf2; + p2.res_low = tl_buf; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ifmap2; + p2.b_is_const = 1; + p2.b_const.val = -1; + p2.b_const.is_signed = 1; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + + // mul 255 and rightshift to get 0/1, 1 means tl_ifmap > tl_ifmap2 + // get positive + tl_buf->fmt = CVK_FMT_U8; + cvk_tiu_mul_param_t p1 = {0}; + p1.res_high = 0; + p1.res_low = tl_buf; + p1.a = tl_buf; + p1.b_const.val = 255; + p1.b_const.is_signed = 0; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + // that max = 127 + tl_buf->fmt = CVK_FMT_I8; + p1.res_high = 0; + p1.res_low = tl_buf; + p1.a = tl_buf; + p1.b_const.val = -127; + p1.b_const.is_signed = 1; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + // 127 >> 7 to 0/1 + p1.res_high = 0; + p1.res_low = tl_buf; + p1.a = tl_buf; + p1.b_const.val = 1; + p1.b_const.is_signed = 1; + p1.b_is_const = true; + p1.rshift_bits = 7; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + tl_ifmap->fmt = tl_ofmap->fmt = tl_buf->fmt = CVK_FMT_I8; + // keep a that a > b + cvm_emit_mul(ctx, 
tl_buf, tl_ifmap, tl_ofmap, tl_ofmap->fmt); + + // mul -1 for get - b under a > b + cvm_emit_mul_const(ctx, tl_buf, tl_buf, tl_buf->fmt, -1); + + // get a - b = a + (-1) * b + // cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, tl_buf2->fmt, 0); + p2.res_high = tl_buf2; // dont care add garbage + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ifmap2; + p2.b_is_const = 0; + p2.b = tl_buf; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + cvm_emit_mul_const(ctx, tl_ofmap, tl_ofmap, tl_ofmap->fmt, 1); + + // hoist to int16 + tl_buf2->fmt = CVK_FMT_I8; + p1.res_high = tl_buf2; + p1.res_low = tl_buf; + p1.a = tl_buf; + p1.b_const.val = 1; + p1.b_const.is_signed = 1; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + // get revert 0/-1 to 1/0, get a < b case + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_buf; + p4.a_high = tl_buf2; + p4.a_low = tl_buf; + p4.b_is_const = 1; + p4.b_const.val = 1; + p4.b_const.is_signed = 1; + p4.rshift_bits = 0; + p4.relu_enable = 0; + ctx->ops->tiu_add(ctx, &p4); + + // remove a < b in b + cvm_emit_mul(ctx, tl_buf, tl_ifmap2, tl_ifmap2, tl_ifmap2->fmt); + + // mul -1 for -a + cvm_emit_mul_const(ctx, tl_buf, tl_buf, tl_buf->fmt, -1); + + // aops->tiu_mac(ctx, &p2); + + // output is u8, a > b part merge a < b + p2.res_high = tl_buf2; // dont care add garbage + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ifmap2; + p2.b_is_const = 1; + p2.b_const.val = 1; + p2.b_const.is_signed = 0; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + + // restore + tl_buf->fmt = tl_buf2->fmt = tl_ifmap->fmt = tl_ifmap2->fmt = tl_ofmap->fmt = fmt; + + return ret; +} + +int cvm_gen_image_diff_dp(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_ifmap2, + cvk_tl_t* 
tl_buf, cvk_tl_t* tl_buf2, cvk_tl_t* tl_ofmap) { + int ret = 0; + cvk_fmt_t fmt = tl_ifmap->fmt; + if (!check_u8(tl_ifmap, tl_ifmap2, tl_ofmap)) { + // throw config error + printf("input/buf/ofmap format should config CVK_FMT_U8\n"); + return -1; + } + + if (!check_same_fmt(tl_ifmap, tl_buf, tl_buf2)) { + printf("all tensor's fmt shoud be equal\n"); + return -2; + } + + if (tl_ofmap->fmt == CVK_FMT_BF16) { + // TODO: support it + } else if (tl_ofmap->fmt == CVK_FMT_U8) { + } else { + printf("not support fmt\n"); + return -3; + } + + // tl_buf = max(\tl_ifmap, \tl_ifmap2) + // tl_buf2-tl_buf = tl_buf2-tl_buf + (- 1 * tl_ifmap2), == 0 means \tl_ifmap < \tl_ifmap2, + // otherwise \tl_ifmap > \tl_ifmap2 tl_buf = tl_buf * 255 to get 0/1, 1 means \tl_ifmap > + // \tl_ifmap3 get large one + cvk_tiu_max_param_t p13 = {0}; + p13.max = tl_buf; + p13.a = tl_ifmap; + p13.b_is_const = 0; + p13.b = tl_ifmap2; + p13.layer_id = 0; + ctx->ops->tiu_max(ctx, &p13); + + // compare to get a > b or a < b, 1 means a > b + // cvk_tiu_sub_param_t p5; + // p5.res_high = 0; // saturatue to int8 + // p5.res_low = tl_ofmap; + // p5.a_high= tl_buf2; + // p5.a_low = tl_buf; + // p5.b_high = tl_buf2; + // p5.b_low = tl_ifmap2; + // p5.rshift_bits = 0; + // ctx->ops->tiu_sub(ctx, &p5); + tl_ifmap2->fmt = tl_buf->fmt = tl_buf2->fmt = CVK_FMT_I8; + cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, tl_buf2->fmt, 0); + cvk_tiu_mac_param_t p2; + p2.res_high = tl_buf2; + p2.res_low = tl_buf; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ifmap2; + p2.b_is_const = 1; + p2.b_const.val = -1; + p2.b_const.is_signed = 1; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + + // mul 255 and rightshift to get 0/1, 1 means tl_ifmap > tl_ifmap2 + // get positive + tl_buf->fmt = CVK_FMT_U8; + cvk_tiu_mul_param_t p1 = {0}; + p1.res_high = 0; + p1.res_low = tl_buf; + p1.a = tl_buf; + p1.b_const.val = 255; + p1.b_const.is_signed = 0; + 
p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + // that max = 127 + tl_buf->fmt = CVK_FMT_I8; + p1.res_high = 0; + p1.res_low = tl_buf; + p1.a = tl_buf; + p1.b_const.val = -127; + p1.b_const.is_signed = 1; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + // 127 >> 7 to 0/1 + p1.res_high = 0; + p1.res_low = tl_buf; + p1.a = tl_buf; + p1.b_const.val = 1; + p1.b_const.is_signed = 1; + p1.b_is_const = true; + p1.rshift_bits = 7; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + tl_ifmap->fmt = tl_ofmap->fmt = tl_buf->fmt = CVK_FMT_I8; + // keep a that a > b + cvm_emit_mul(ctx, tl_buf, tl_ifmap, tl_ofmap, tl_ofmap->fmt); + + // mul -1 for get - b under a > b + cvm_emit_mul_const(ctx, tl_buf, tl_buf, tl_buf->fmt, -1); + + // get a - b = a + (-1) * b + // cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, tl_buf2->fmt, 0); + p2.res_high = tl_buf2; // dont care add garbage + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ifmap2; + p2.b_is_const = 0; + p2.b = tl_buf; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + cvm_emit_mul_const(ctx, tl_ofmap, tl_ofmap, tl_ofmap->fmt, 1); + + // hoist to int16 + tl_buf2->fmt = CVK_FMT_I8; + p1.res_high = tl_buf2; + p1.res_low = tl_buf; + p1.a = tl_buf; + p1.b_const.val = 1; + p1.b_const.is_signed = 1; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + // get revert 0/-1 to 1/0, get a < b case + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_buf; + p4.a_high = tl_buf2; + p4.a_low = tl_buf; + p4.b_is_const = 1; + p4.b_const.val = 1; + p4.b_const.is_signed = 1; + p4.rshift_bits = 0; + p4.relu_enable = 0; + ctx->ops->tiu_add(ctx, &p4); + + // remove a < b in b + cvm_emit_mul(ctx, tl_buf, 
tl_ifmap2, tl_ifmap2, tl_ifmap2->fmt); + + // mul -1 for -a + cvm_emit_mul_const(ctx, tl_buf, tl_buf, tl_buf->fmt, -1); + + // aops->tiu_mac(ctx, &p2); + + // output is u8, a > b part merge a < b + p2.res_high = tl_buf2; // dont care add garbage + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ifmap2; + p2.b_is_const = 1; + p2.b_const.val = 1; + p2.b_const.is_signed = 0; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + + // restore + tl_buf->fmt = tl_buf2->fmt = tl_ifmap->fmt = tl_ifmap2->fmt = tl_ofmap->fmt = fmt; + + return ret; +} +int cvm_update_tbl_by_threshold(cvk_context_t* ctx, cvk_tl_t* tl_mask, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_update_tbl, + uint8_t threshold_a, uint8_t threshold_b, cvk_tl_t* tl_ofmap) { + int ret = 0; + (void)threshold_b; + (void)tl_mask; + (void)tl_buf2; + (void)tl_buf3; + (void)tl_update_tbl; + cvk_fmt_t fmt = tl_ofmap->fmt; + if (!check_u8(tl_ofmap, tl_buf, tl_buf)) { + // throw config error + printf("ofmap/buf format should config CVK_FMT_U8\n"); + return -1; + } + + if (!check_same_fmt(tl_ofmap, tl_buf, tl_buf)) { + printf("all tensor's fmt shoud be equal\n"); + return -2; + } + + cvk_tl_t* high = tl_buf; + if (tl_ofmap->fmt == CVK_FMT_BF16) { + high = NULL; + } else if (tl_ofmap->fmt == CVK_FMT_U8) { + // hw limitation that input should be i8 + // cvm_emit_mul_const(ctx, high, high, high->fmt, 0); + } else { + printf("not support fmt\n"); + return -3; + } + + // mask = get_less_u8(diff[i], thresh_a) + _get_less_mask(ctx, high, tl_buf2, tl_update_tbl, threshold_a); + + // mask_1 = get_less_i8(update_tbl[i], thresh_b), 0/1 + tl_buf2->fmt = tl_ofmap->fmt = tl_buf3->fmt = CVK_FMT_I8; + _get_less_mask(ctx, tl_buf2, tl_buf3, tl_ofmap, threshold_b); + + // mask_2 = mask * mask_1 // keep for next triple if-else + // tl_update_tbl as buf + tl_update_tbl->fmt = tl_buf->fmt = CVK_FMT_I8; + 
cvm_emit_mul(ctx, tl_buf, tl_buf2, tl_update_tbl, tl_buf2->fmt); + + cvm_emit_mul_const(ctx, tl_update_tbl, tl_update_tbl, tl_buf2->fmt, -1); + cvm_emit_mul_const(ctx, tl_buf3, tl_buf3, tl_buf3->fmt, 0); // diff 0 used + // update_tbl[i] = update_tbl[i] - mask_2 * update_tbl[i], set 0 + // sub itself leverage int16 is ok, plz refer \cvm_set_image_by_u8mask + cvk_tiu_mac_param_t p2; + p2.res_high = tl_buf3; // diff itsef MUST set high part as 0 + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_update_tbl; + p2.b_is_const = 0; + p2.b = tl_ofmap; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + + // mask_1 = (mask_1 - 1), hoist it + cvk_tiu_mul_param_t p1 = {0}; + p1.res_high = tl_buf3; + p1.res_low = tl_buf2; + p1.a = tl_buf2; + p1.b_const.val = 1; + p1.b_const.is_signed = 1; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_buf2; + p4.a_high = tl_buf3; + p4.a_low = tl_buf2; + p4.b_is_const = 1; + p4.b.high = 0; + p4.b_const.val = -1; + p4.b_const.is_signed = 1; + p4.rshift_bits = 0; + p4.relu_enable = 0; + ctx->ops->tiu_add(ctx, &p4); + + // mask_2 = mask * mask_1 + cvm_emit_mul(ctx, tl_buf, tl_buf2, tl_update_tbl, tl_buf2->fmt); + + // update_tbl[i] = update_tbl[i] + mask_2 // (update_tbl[i]-1) + // int8, hoist it + p1.res_high = tl_buf3; + p1.res_low = tl_ofmap; + p1.a = tl_ofmap; + p1.b_const.val = 1; + p1.b_const.is_signed = 1; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + p2.res_high = tl_buf3; + p2.res_low = tl_ofmap; + p2.res_is_int8 = 1; // keep origin + p2.a = tl_update_tbl; + p2.b_is_const = 1; + p2.b_const.val = 1; + p2.b_const.is_signed = 1; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + 
ctx->ops->tiu_mac(ctx, &p2); + + // mask = (mask - 1) * -1 // export, rever 0/1 to 1/0 + cvm_emit_mul_const(ctx, tl_buf3, tl_buf3, tl_buf3->fmt, 0); // diff 0 used + p4.res_high = 0; + p4.res_low = tl_buf; + p4.a_high = tl_buf3; + p4.a_low = tl_buf; + p4.b_is_const = 1; + p4.b.high = 0; + p4.b_const.val = -1; + p4.b_const.is_signed = 1; + p4.rshift_bits = 0; + p4.relu_enable = 0; + ctx->ops->tiu_add(ctx, &p4); + + cvm_emit_mul_const(ctx, tl_buf, tl_buf, tl_buf->fmt, -1); + + // update_tbl[i] = update_tbl[i] + mask // update_tbl[i]++, return + // int8, hoist it + p1.res_high = tl_buf3; + p1.res_low = tl_ofmap; + p1.a = tl_ofmap; + p1.b_const.val = 1; + p1.b_const.is_signed = 1; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + p2.res_high = tl_buf3; + p2.res_low = tl_ofmap; + p2.res_is_int8 = 1; // keep origin + p2.a = tl_buf; + p2.b_is_const = 1; + p2.b_const.val = 1; + p2.b_const.is_signed = 1; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + + // restore + tl_buf2->fmt = tl_buf3->fmt = tl_ofmap->fmt = tl_update_tbl->fmt = tl_buf->fmt = fmt; + + return ret; +} + +int cvm_set_image_by_two_info_u8(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_update_tbl, uint8_t threshold, + cvk_tl_t* tl_ofmap) { + cvk_fmt_t fmt = tl_ifmap->fmt; + if (!check_u8(tl_ifmap, tl_buf, tl_update_tbl)) { + // throw config error + printf("input/buf/tl_update_tbl format should config CVK_FMT_U8\n"); + return -1; + } + + if (!check_same_fmt(tl_ifmap, tl_buf, tl_buf2)) { + printf("all tensor's fmt shoud be equal\n"); + return -2; + } + + cvk_tl_t* high = tl_buf; + if (tl_ofmap->fmt == CVK_FMT_BF16) { + high = NULL; + } else if (tl_ofmap->fmt == CVK_FMT_U8) { + // hw limitation that input should be i8 + // tl_buf2->fmt = tl_update_tbl->fmt = tl_ofmap->fmt = tl_buf->fmt = CVK_FMT_I8; + } else { + 
printf("not support fmt\n"); + return -3; + } + + // large equal, u8 compare + _get_large_mask(ctx, high, tl_buf2, tl_update_tbl, threshold - 1); + // return 0; + + // restore + tl_buf2->fmt = tl_update_tbl->fmt = tl_buf->fmt = tl_ofmap->fmt = fmt; + + return cvm_set_image_by_u8mask(ctx, tl_ifmap, tl_buf2, high, tl_ofmap); +} + +int cvm_blend_image_by_tbl(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_update_tbl, uint8_t threshold, + uint8_t w1, uint8_t w2, cvk_tl_t* tl_ofmap) { + int ret = 0; + cvk_fmt_t fmt = tl_ifmap->fmt; + if (!check_u8(tl_ifmap, tl_buf, tl_ofmap) || !check_u8(tl_buf2, tl_update_tbl, tl_update_tbl)) { + // throw config error + printf("input/input1/input2/tl_update_tbl/buf/ofmap format should config CVK_FMT_U8\n"); + return -1; + } + + if (!check_same_fmt(tl_ifmap, tl_buf, tl_update_tbl) && + !check_same_fmt(tl_buf2, tl_update_tbl, tl_ofmap)) { + printf("all tensor's fmt shoud be equal\n"); + return -2; + } + + cvk_tl_t* high = tl_buf; + if (tl_ofmap->fmt == CVK_FMT_BF16) { + // TODO: support it + high = NULL; + } else if (tl_ofmap->fmt == CVK_FMT_U8) { + // hw limitation that input should be i8 + tl_buf2->fmt = tl_buf->fmt = tl_update_tbl->fmt = CVK_FMT_I8; + cvm_emit_mul_const(ctx, high, high, high->fmt, 0); + } else { + printf("not support fmt\n"); + return -3; + } + + // get g_update_tbl[i]>threshold + _get_large_mask(ctx, high, tl_buf2, tl_update_tbl, threshold); + + // dirty tl_update_tbl + // TODO: not copy again + cvm_emit_mul_const(ctx, high, tl_update_tbl, tl_buf->fmt, 1); + + tl_buf2->fmt = tl_buf->fmt = tl_ofmap->fmt = CVK_FMT_U8; + // ofmap * w1, keep high part + cvk_tiu_mul_param_t p; + p.res_high = tl_buf2; + p.res_low = tl_buf; + p.a = tl_ofmap; + p.b_is_const = 1; + p.b_const.val = w1; + p.b_const.is_signed = 0; + p.rshift_bits = 0; + p.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p); + + // buf2 = buf2 + w2*pY[i], i16 output, it should be >= 0 + cvk_tiu_mac_param_t p2; + p2.res_high = 
tl_buf2; + p2.res_low = tl_buf; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ifmap; + p2.b_is_const = 1; + p2.b_const.val = w2; + p2.b_const.is_signed = 0; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + + // keep update_tbl[i]>threshold mask, it ok for signed that just keep data + tl_buf2->fmt = tl_update_tbl->fmt = CVK_FMT_I8; + cvm_emit_mul(ctx, tl_buf2, tl_update_tbl, tl_buf2, tl_update_tbl->fmt); + + // mul -1 for sub it, dirty tl_update_tbl + cvm_emit_mul_const(ctx, tl_update_tbl, tl_update_tbl, tl_update_tbl->fmt, -1); + + high->fmt = tl_ofmap->fmt = CVK_FMT_I8; + // NOTICE: we only keep low part as U8 + // set update_tbl[i]>threshold as 0 + // sub itself leverage int16 is ok, plz refer \cvm_set_image_by_u8mask + p2.res_high = high; + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ofmap; + p2.b_is_const = 0; + p2.b = tl_update_tbl; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + + // ouput as U8, get high part + tl_buf->fmt = tl_ofmap->fmt = tl_buf2->fmt = CVK_FMT_U8; + p2.res_high = high; // dont care + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_buf2; + p2.b_is_const = 1; + p2.b_const.val = 1; + p2.b_const.is_signed = 0; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + + // restore + tl_buf2->fmt = tl_ifmap->fmt = tl_buf->fmt = tl_update_tbl->fmt = tl_ofmap->fmt = fmt; + + return ret; +} diff --git a/cvimath/src/tiu_lut_atan.c b/cvimath/src/tiu_lut_atan.c new file mode 100644 index 000000000..5746284fc --- /dev/null +++ b/cvimath/src/tiu_lut_atan.c @@ -0,0 +1,1106 @@ +/** + * plz refer [git](https://github.com/xiezhq-hermann/atan_lookup) + * input range is `all real numbers` and output range is -pi/2 < x < pi/2, + * you can refer 
[here](https://www.mathopenref.com/arctan.html) for more details + */ +// +// xiezhq@shanghaitech.edu.cn && wanghe@shanghaitech.edu.cn +/* Reference: + [1] Abhisek Ukil, Vishal H Shah, Bernhard Deck, + "Fast Computation of arctangent Functions for Embedded Applications: A + Comparative Analysis" IEEE International Symposium on Industrial Electronics, +Pages: 1206 - 1211, DOI: 10.1109/ISIE.2011.5984330, 2011 +[2] Sreeraman Rajan, Sichun Wang, Robert Inkol, and Alain Joyal +"Efficient Approximations for the Arctangent Function" +IEEE SIGNAL PROCESSING MAGAZINE [108] MAY 2006 +*/ + +#include +#include "gen_lut.h" // NOLINT + +//#define DBG + +static double LUT_d[102] = {0, + 0.00999966668666524, + 0.0199973339731505, + 0.0299910048568779, + 0.0399786871232900, + 0.0499583957219428, + 0.0599281551212079, + 0.0698860016346425, + 0.0798299857122373, + 0.0897581741899505, + 0.0996686524911620, + 0.109559526773944, + 0.119428926018338, + 0.129275004048143, + 0.139095941482071, + 0.148889947609497, + 0.158655262186401, + 0.168390157147530, + 0.178092938231198, + 0.187761946513593, + 0.197395559849881, + 0.206992194219821, + 0.216550304976089, + 0.226068387993884, + 0.235544980720863, + 0.244978663126864, + 0.254368058553266, + 0.263711834462266, + 0.273008703086711, + 0.282257421981491, + 0.291456794477867, + 0.300605670042395, + 0.309702944542456, + 0.318747560420644, + 0.327738506780556, + 0.336674819386727, + 0.345555580581712, + 0.354379919123438, + 0.363147009946176, + 0.371856073848581, + 0.380506377112365, + 0.389097231055278, + 0.397627991522129, + 0.406098058317616, + 0.414506874584786, + 0.422853926132941, + 0.431138740718782, + 0.439360887284591, + 0.447519975157170, + 0.455615653211225, + 0.463647609000806, + 0.471615567862328, + 0.479519291992596, + 0.487358579505190, + 0.495133263468404, + 0.502843210927861, + 0.510488321916776, + 0.518068528456721, + 0.525583793551610, + 0.533034110177490, + 0.540419500270584, + 0.547740013715902, + 0.554995727338587, + 
0.562186743900029, + 0.569313191100662, + 0.576375220591184, + 0.583373006993856, + 0.590306746935372, + 0.597176658092678, + 0.603982978252998, + 0.610725964389209, + 0.617405891751573, + 0.624023052976757, + 0.630577757214935, + 0.637070329275684, + 0.643501108793284, + 0.649870449411948, + 0.656178717991395, + 0.662426293833151, + 0.668613567927821, + 0.674740942223553, + 0.680808828915828, + 0.686817649758645, + 0.692767835397122, + 0.698659824721463, + 0.704494064242218, + 0.710271007486686, + 0.715991114416300, + 0.721654850864761, + 0.727262687996690, + 0.732815101786507, + 0.738312572517228, + 0.743755584298860, + 0.749144624606017, + 0.754480183834406, + 0.759762754875771, + 0.764992832710910, + 0.770170914020331, + 0.775297496812126, + 0.780373080066636, + 0.785398163397448, + 0.790373246728302}; + +void cvm_atan_y0(uint16_t* table_data_y0, cvk_tl_shape_t* table_shape) { + ASSERT(is_1880v2_tbl_shape(table_shape)); + + int table_hw = cvm_table_hw(); + + /** + * index 0 1 2 3 60 61 62 63 64 65 123 124 125 126 + *-------- + * exp (2) x -62 -61 -60 ... -3 -2 -1 0 1 2 .... 60 61 62 63 + * + * index 128 129 130 131 188 189 190 191 192 193 251 252 253 254 255 + *-------- + * exp (-2)x -62 -61 -60 ... -3 -2 -1 0 1 2 ... 
60 61 62 63 x + * + */ + + // [0 102) for > 1 + int lut_sz = sizeof(LUT_d) / sizeof(LUT_d[0]); + for (int i = 0; i < lut_sz; i++) { + table_data_y0[i] = convert_fp32_bf16(M_PI_2 - LUT_d[i]); + } + + // [102 204) for [0 1] + for (int i = lut_sz; i < lut_sz * 2; i++) { + table_data_y0[i] = convert_fp32_bf16(LUT_d[i - lut_sz]); + } + +#ifdef DBG + for (int i = 0; i < lut_sz * 2; i++) { + printf("y0[%d] is %f(0x%x)\n", i, convert_bf16_fp32(table_data_y0[i]), table_data_y0[i]); + } +#endif + // duplicate channel #1 to #31 + // TODO: tensor copy + for (uint32_t i = 1; i < table_shape->c; i++) { + memcpy(&table_data_y0[i * table_hw], &table_data_y0[0], sizeof(uint16_t) * table_hw); + } +} + +void cvm_atan_fast_degree_y0(uint16_t* table_data_y0, cvk_tl_shape_t* table_shape) { + ASSERT(is_1880v2_tbl_shape(table_shape)); + + int table_hw = cvm_table_hw(); + + /** + * index 0 1 2 3 60 61 62 63 64 65 123 124 125 126 + *-------- + * exp (2) x -62 -61 -60 ... -3 -2 -1 0 1 2 .... 60 61 62 63 + * + * index 128 129 130 131 188 189 190 191 192 193 251 252 253 254 255 + *-------- + * exp (-2)x -62 -61 -60 ... -3 -2 -1 0 1 2 ... 
60 61 62 63 x + * + */ + + // [0 102) for > 1 + int lut_sz = sizeof(LUT_d) / sizeof(LUT_d[0]); + for (int i = 0; i < lut_sz; i++) { + table_data_y0[i] = convert_fp32_bf16((M_PI_2 - LUT_d[i]) * 180 / M_PI); + } + + // [102 204) for [0 1] + for (int i = lut_sz; i < lut_sz * 2; i++) { + table_data_y0[i] = convert_fp32_bf16(LUT_d[i - lut_sz] * 180 / M_PI); + } + +#ifdef DBG + for (int i = 0; i < lut_sz * 2; i++) { + printf("y0[%d] is %f(0x%x)\n", i, convert_bf16_fp32(table_data_y0[i]), table_data_y0[i]); + } +#endif + // duplicate channel #1 to #31 + // TODO: tensor copy + for (uint32_t i = 1; i < table_shape->c; i++) { + memcpy(&table_data_y0[i * table_hw], &table_data_y0[0], sizeof(uint16_t) * table_hw); + } +} + +void cvm_atan_slope(uint16_t* OUT table_slope, cvk_tl_shape_t* table_shape) { + int table_hw = cvm_table_hw(); + + int lut_sz = sizeof(LUT_d) / sizeof(LUT_d[0]) - 1; + for (volatile int i = 0; i < lut_sz; i++) { + table_slope[i] = convert_fp32_bf16(LUT_d[i + 1] - LUT_d[i]); + } + + // duplicate channel #1 to #31 + // TODO: tensor copy + for (uint64_t i = 1; i < table_shape->c; i++) { + memcpy(&table_slope[table_hw * i], &table_slope[0], sizeof(uint16_t) * table_hw); + } +} + +// 'bf16_atan_s_01' means atan split [0 1] and (1, +// data in [0-1] mutilply 1, > 1 mutiply with -1 +void cvm_atan_s_01(uint16_t* OUT table_invert, cvk_tl_shape_t* table_shape) { + int half = half_h_table(); + int table_hw = cvm_table_hw(); + + // data in [0, 1], mutilply 1 +#if 1 + for (uint32_t i = 0; i < 63; i++) { + table_invert[i] = convert_fp32_bf16(1.0); + table_invert[i + half] = convert_fp32_bf16(1.0); + } + + // data > 1 + for (int i = 63; i < half; i++) { + table_invert[i] = convert_fp32_bf16(-1.0); + table_invert[i + half] = convert_fp32_bf16(-1.0); + } +#endif + + // duplicate channel #1 to #31 + // TODO: tensor copy + for (uint64_t i = 1; i < table_shape->c; i++) { + memcpy(&table_invert[table_hw * i], &table_invert[0], sizeof(uint16_t) * table_hw); + } +} + +// 
'pos_neg' means data is positive(>=0) is 1 or negtive(<0) is -1 +void cvm_pos_neg_tbl(uint16_t* OUT table_pos_neg, cvk_tl_shape_t* table_shape) { + uint32_t half = half_h_table(); + int table_hw = cvm_table_hw(); + + // data >= 0 + for (uint32_t i = 0; i < half; i++) { + table_pos_neg[i] = convert_fp32_bf16(1.0); + } + + // data < 0 + for (uint32_t i = half; i < half * 2; i++) { + table_pos_neg[i] = convert_fp32_bf16(-1.0); + } + + // duplicate channel #1 to #31 + // TODO: tensor copy + for (uint64_t i = 1; i < table_shape->c; i++) { + memcpy(&table_pos_neg[table_hw * i], &table_pos_neg[0], sizeof(uint16_t) * table_hw); + } +} + +void cvm_atan_pos_neg(uint16_t* OUT table_pos_neg, cvk_tl_shape_t* table_shape) { + cvm_pos_neg_tbl(table_pos_neg, table_shape); +} + +/* Syntactic sugar for get more precision + * raw implement code : + + double re_x = 1 / x; + int index = round(re_x * 100); + return (M_PI_2 - (LUT_d[index] + (re_x * 100 - index) * (LUT_d[index + 1] - LUT_d[index]))); + and we want to get `(LUT_d[index] + (re_x * 100 - index)` part + */ +int cvm_atan_slope_multipilier(cvk_context_t* ctx, cvk_tl_t* tl_buf, cvk_tl_t* tl_buf2, + cvk_tl_t* tl_buf3, cvk_tl_t* tl_ifmap, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t fmt) { + (void)fmt; + cvm_get_dec(ctx, tl_buf, tl_buf2, tl_buf3); + // z = (min(x,y) * 100 - index) * slope(index) + + // fill to 100 + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_buf2; + p1.a = tl_buf; + p1.b_is_const = 1; + p1.b_const.val = convert_fp32_bf16(0); + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + // add + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_buf2; + p4.a_high = 0; + p4.a_low = tl_buf2; + p4.b_is_const = 1; + p4.b.high = 0; + p4.b_const.val = convert_fp32_bf16(-100.0); + p4.rshift_bits = 0; + p4.relu_enable = 0; + + ctx->ops->tiu_add(ctx, &p4); + + cvk_tiu_mac_param_t p2; + p2.res_high = 0; + p2.res_low = tl_buf3; + p2.res_is_int8 = 0; + p2.a = tl_ifmap; + 
p2.b_is_const = 0; + p2.b = tl_buf2; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + + ctx->ops->tiu_mac(ctx, &p2); + + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_buf3; + p1.b_is_const = 1; + p1.b_const.val = convert_fp32_bf16(-1.0); + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + return 0; +} + +/** issue atan >= 0 + * \b for more precision, we use mac for atan2 + * if (x > 1) { + * x = 1 / x + * } + * int index = round(x * 100); + * double r = (x * 100 - index) * (LUT_d[index + 1] - LUT_d[index]); + * double shift = LUT_d[index]; + * if (x > 1) { + * shift = M_PI_2 - LUT_d[index]; + * } + * return r + shift; + * FIXME: reduce temp buffer count + */ +int _cvm_atan_emit(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf, cvk_tl_t* tl_buf2, + cvk_tl_t* tl_buf3, cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_slope_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_buf, cvk_tl_t* tl_table_answer, + cvk_tl_t* tl_table_answer_mantissa, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, + float b) { + cvm_table_check(tl_ifmap, tl_y0_buf, tl_slope_buf, tl_ifmap); + cvm_table_check(tl_buf, tl_invert_buf, tl_pos_neg_buf, tl_buf2); + cvm_table_check(tl_buf3, tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16); + + // x = abs(x0) + // y = 1 / x + // index = 100 * min(x, y) + // z = (min(x,y) * 100 - index) * slope(index) + // invert = table_0_102(x) ([0-1] return 1, >1 return -1) + // invert = invert * z + // t = 64 * (table_0_102 + 1) + // shift_index = t(index) ([0-1] return 102, >1 return 0) + // shift = y0(shift_index + index) ([0-1] return LUT_d[index], >1 return M_PI_2 - LUT_d[index] + // p = table_pos_neg(x0) (>= 0 return 1, < 0 return -1) + // p(shift + invert * z) + + cvm_emit_abs(ctx, tl_ifmap, tl_buf, fmt); + + // y = 1 / x + cvm_emit_reciprocal(ctx, tl_buf, tl_buf2, tl_table_answer, tl_table_answer_mantissa, + tl_ofmap_bf16); + + cvk_tiu_min_param_t p7; + p7.min = 
tl_ofmap_bf16; + p7.a = tl_buf; + p7.b_is_const = 0; + p7.b = tl_ofmap_bf16; + + ctx->ops->tiu_min(ctx, &p7); + + // get index + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_buf; + p1.a = tl_ofmap_bf16; + p1.b_is_const = 1; + p1.b_const.val = convert_fp32_bf16(100.0); + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + cvm_atan_slope_multipilier(ctx, tl_buf, tl_buf2, tl_buf3, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // get int8 index of x2 + cvm_get_uint8_t_tbl_idx(ctx, tl_buf, tl_buf2); + + // x0 = base[x2] + (0.x * (slope[x2]) + // TODO: use mac + + // get slope[x2] + cvk_tiu_lookup_table_param_t p12; + p12.ofmap = tl_buf3; + p12.ifmap = tl_buf2; + p12.table = tl_slope_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); + + // z = (min(x,y) * 100 - index) * slope(index) + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ofmap_bf16; + p1.b_is_const = 0; + p1.b = tl_buf3; + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + // get index from exp, + // mv_lut_base get exp as index, remove mantissa + cvk_tdma_l2l_tensor_copy_param_t p10; + p10.dst = tl_buf3; + p10.src = tl_ifmap; + p10.mv_lut_base = false; + p10.mv_lut_idx = true; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + // invert = table_0_102(x) ([0-1] return 1, >1 return -1) + p12.ofmap = tl_buf3; + p12.ifmap = tl_buf3; + p12.table = tl_invert_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); + + // z = invert * z + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ofmap_bf16; + p1.b_is_const = 0; + p1.b = tl_buf3; + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + // t = 51 * (invert + 1), -> invert + 1 + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_buf3; + p4.a_high = 0; + p4.a_low = tl_buf3; + p4.b_is_const = 1; + p4.b.high = 0; + p4.b_const.val = convert_fp32_bf16(1.0); + p4.rshift_bits = 0; + p4.relu_enable = 0; + + 
ctx->ops->tiu_add(ctx, &p4); + + // t = 51 * (invert + 1) + p1.res_high = NULL; + p1.res_low = tl_buf3; + p1.a = tl_buf3; + p1.b_is_const = 1; + p1.b_const.val = convert_fp32_bf16(51.0); + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + +#if 1 + // avoid rounding, we first round org index + cvm_get_uint8_t_tbl_idx(ctx, tl_buf, tl_buf2); + tl_buf2->fmt = CVK_FMT_U8; + cvk_tl_shape_t t = tl_buf2->shape; + cvk_tl_stride_t s = tl_buf2->stride; + tl_buf2->shape.h = tl_buf2->shape.h * tl_buf2->shape.w; + tl_buf2->shape.w = 1; + tl_buf2->stride.h = 2; + tl_buf2->stride.c = tl_buf2->shape.h * tl_buf2->shape.w; + tl_buf2->stride.c = tl_buf2->shape.c * tl_buf2->shape.h * tl_buf2->shape.w; + p10.dst = tl_buf; + p10.src = tl_buf2; + p10.mv_lut_base = false; + p10.mv_lut_idx = false; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + tl_buf2->fmt = CVK_FMT_BF16; + tl_buf2->shape = t; + tl_buf2->stride = s; +#else +#endif + // t = t + index + p4.res_high = 0; + p4.res_low = tl_buf3; + p4.a_high = 0; + p4.a_low = tl_buf3; + p4.b_is_const = 0; + p4.b.high = 0; + p4.b.low = tl_buf; + p4.rshift_bits = 0; + p4.relu_enable = 0; + + ctx->ops->tiu_add(ctx, &p4); + + // get int8 index for lut + cvm_get_uint8_t_tbl_idx(ctx, tl_buf3, tl_buf); + + // shift = y0(t) ([0-1] return LUT_d[index], >1 return M_PI_2 - LUT_d[index] + p12.ofmap = tl_buf3; + p12.ifmap = tl_buf; + p12.table = tl_y0_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); + + // z = base[x2] + (0.x * (slope[x2]) + p4.res_high = 0; + p4.res_low = tl_buf2; + p4.a_high = 0; + p4.a_low = tl_ofmap_bf16; + p4.b_is_const = 0; + p4.b.high = 0; + p4.b.low = tl_buf3; + p4.rshift_bits = 0; + p4.relu_enable = 0; + + ctx->ops->tiu_add(ctx, &p4); + + // get pos neg, use mv_lut_idx + p10.dst = tl_buf3; + p10.src = tl_ifmap; + p10.mv_lut_base = false; + p10.mv_lut_idx = true; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + // p = table_pos_neg(x0) (>= 0 return 1, < 0 return -1) + 
p12.ofmap = tl_buf3; + p12.ifmap = tl_buf3; + p12.table = tl_pos_neg_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); +#if 0 + // p * z + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_buf2; + p1.b_is_const = 0; + p1.b = tl_buf3; + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); +#else + + // add pi/-pi for atan2 + cvm_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 0.0); + cvm_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, b); + + // p * z + pi + cvk_tiu_mac_param_t p2; + p2.res_high = 0; + p2.res_low = tl_ofmap_bf16; + p2.res_is_int8 = 0; + p2.a = tl_buf2; + p2.b_is_const = 0; + p2.b = tl_buf3; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + + ctx->ops->tiu_mac(ctx, &p2); +#endif + + return 0; +} + +int cvm_atan_emit(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf, cvk_tl_t* tl_buf2, + cvk_tl_t* tl_buf3, cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_slope_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_buf, cvk_tl_t* tl_table_answer, + cvk_tl_t* tl_table_answer_mantissa, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + return _cvm_atan_emit(ctx, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, tl_slope_buf, + tl_invert_buf, tl_pos_neg_buf, tl_table_answer, tl_table_answer_mantissa, + tl_ofmap_bf16, fmt, 0.0); +} + +/** + * \table_data_atan_slope is optional, NULL for not assign it + */ +void cvm_atan_tbl(uint16_t* table_data_atan_y0, uint16_t* table_data_atan_slope, + uint16_t* table_data_atan_invert, uint16_t* table_data_atan_pos_neg, + cvk_tl_shape_t* table_shape) { + ASSERT(table_data_atan_y0); + // ASSERT(table_data_atan_slope); + ASSERT(table_data_atan_invert); + ASSERT(table_data_atan_pos_neg); + ASSERT(table_shape); + + cvm_atan_y0(table_data_atan_y0, table_shape); + if (table_data_atan_slope) { + cvm_atan_slope(table_data_atan_slope, table_shape); + } + cvm_atan_s_01(table_data_atan_invert, table_shape); + cvm_pos_neg_tbl(table_data_atan_pos_neg, 
table_shape); +} + +void cvm_atan_fast_degree_tbl(uint16_t* table_data_atan_y0, uint16_t* table_data_atan_invert, + uint16_t* table_data_atan_pos_neg, cvk_tl_shape_t* table_shape) { + ASSERT(table_data_atan_y0); + ASSERT(table_data_atan_invert); + ASSERT(table_data_atan_pos_neg); + ASSERT(table_shape); + + cvm_atan_fast_degree_y0(table_data_atan_y0, table_shape); + cvm_atan_s_01(table_data_atan_invert, table_shape); + cvm_pos_neg_tbl(table_data_atan_pos_neg, table_shape); +} + +/** issue atan >= 0 + * for fast version, we discard slope + * tl_y0_buf[0-102) put 'LUT[index]', [102-204) for 'M_PI_2 - LUT[index]' + */ +int _cvm_atan_fast_emit(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf, cvk_tl_t* tl_buf2, + cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_buf, + cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, float b, bool is_dirty_ifmap) { + cvm_table_check(tl_ifmap, tl_y0_buf, tl_y0_buf, tl_ifmap); + cvm_table_check(tl_buf, tl_invert_buf, tl_pos_neg_buf, tl_buf); + cvm_table_check(tl_buf, tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16); + + cvk_tiu_lookup_table_param_t p12; + cvk_tdma_l2l_tensor_copy_param_t p10; + + // plz refer https://github.com/xiezhq-hermann/atan_lookup/blob/master/atan.cpp + // for faster version + cvm_emit_abs(ctx, tl_ifmap, tl_buf, fmt); + + // y = 1 / x + _cvm_lut_exp_mantissa(ctx, tl_buf, NULL, tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16, + true); + + // once again cuz recipical's input dirtied + cvm_emit_abs(ctx, tl_ifmap, tl_buf, fmt); + + cvk_tiu_min_param_t p7; + p7.min = tl_buf; + p7.a = tl_buf; + p7.b_is_const = 0; + p7.b = tl_ofmap_bf16; + + ctx->ops->tiu_min(ctx, &p7); + + // get index + cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, 100.0); + + // get index from exp, + // mv_lut_base get exp as index, remove mantissa + p10.dst = tl_ofmap_bf16; + p10.src = tl_ifmap; + p10.mv_lut_base = false; + p10.mv_lut_idx = true; + 
ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + cvk_tl_t* tmp = tl_buf2; + if (is_dirty_ifmap) { + tmp = tl_ifmap; + } + + // get pos neg, use mv_lut_idx + p10.dst = tmp; + p10.src = tl_ifmap; + p10.mv_lut_base = false; + p10.mv_lut_idx = true; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + // p = table_pos_neg(x0) (>= 0 return 1, < 0 return -1) + p12.ofmap = tmp; + p12.ifmap = tmp; + p12.table = tl_pos_neg_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); + + // get index of LUT[index] or (M_PI_2 - LUT[index]) + { + // invert = table_0_102(x) ([0-1] return 1, >1 return -1) + p12.ofmap = tl_ofmap_bf16; + p12.ifmap = tl_ofmap_bf16; + p12.table = tl_invert_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); + + cvk_tl_t* out = tl_buf; +#if 1 + // t = 51 * (invert + 1), -> invert + 1 + cvm_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 1.0); + + // t = 51 * (invert + 1) + cvm_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 51.0); + + // t = t + index + cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // get int8 index for lut + // cvm_get_uint8_t_tbl_idx(ctx, tl_ofmap_bf16, tl_buf); + //_cvm_get_tbl_idx(ctx, tl_ofmap_bf16, tl_buf, CVK_FMT_U8, 0); + + //// shift = y0(t) ([0-1] return LUT_d[index], >1 return M_PI_2 - LUT_d[index] + // p12.ofmap = tl_buf; + // p12.ifmap = tl_buf; + // p12.table = tl_y0_buf; + // ctx->ops->tiu_lookup_table(ctx, &p12); + + _cvm_get_tbl_idx(ctx, tl_ofmap_bf16, tl_buf, CVK_FMT_U8, 0); + +#else + // index, output is uint8 format + _cvm_get_tbl_idx(ctx, tl_buf, tl_buf, CVK_FMT_U8, 0); + + // mask value from bf16 -> int8, we add as bf16 + // int8 format (51*(mask + 1) + index) is real remap index for table + // mask = mask + 1 + cvm_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 1.0); + + // mask = 51 * mask + cvm_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 51.0); + + // mask value change to int8 format for lut + _cvm_get_tbl_idx(ctx, 
tl_ofmap_bf16, tl_ofmap_bf16, CVK_FMT_U8, 0); + + // int8 format (51*(mask) + index) is real remap index for table + if (1) { + cvk_tl_t index_uint8_t, mask_uint8_t, fake_uint8_t, out_uint8_t; + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &index_uint8_t, tl_buf, CVK_FMT_U8); + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &out_uint8_t, tl_buf, CVK_FMT_U8); + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &mask_uint8_t, tl_ofmap_bf16, CVK_FMT_U8); + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &fake_uint8_t, tmp, CVK_FMT_U8); + // fake_uint8_t.start_address = + // ctx->info.lmem_size - (fake_uint8_t.shape.h * fake_uint8_t.shape.w); + // //tl_buf->start_address + 1; + // //tl_ifmap->start_address; + + // mask + index + // its safe we only need low part value, so we give fake high part + + cvk_tl_t* a = bmk1880v2_lmem_alloc_tensor(ctx, out_uint8_t.shape, CVK_FMT_U8, CTRL_NULL); +#if 1 + cvk_tiu_add_param_t p4; + p4.res_high = 0; + // p4.res_low = &mask_uint8_t; + p4.res_low = &index_uint8_t; + p4.a_high = a; + p4.a_low = &index_uint8_t; + p4.b_is_const = 0; + p4.b.high = a; + p4.b.low = &mask_uint8_t; + p4.rshift_bits = 0; + p4.relu_enable = 0; + + ctx->ops->tiu_add(ctx, &p4); + // out = tl_ofmap_bf16; +#else + { + cvk_tiu_mul_param_t p; + p.res_high = NULL; + p.res_low = a; + p.a = a; + p.b_is_const = 1; + p.b_const.val = 0; + p.b_const.is_signed = 0; + p.rshift_bits = 0; + p.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p); + } + + out = tl_ofmap_bf16; + cvk_tiu_mac_param_t p2; + p2.res_high = a; + p2.res_low = &mask_uint8_t; + p2.res_is_int8 = 0; + p2.a = &index_uint8_t; + p2.b_is_const = 1; + p2.b = 0; + p2.b_const.val = 1; + p2.b_const.is_signed = 0; + p2.lshift_bits = 0; + p2.rshift_bits = 0; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); +#endif + bmk1880v2_lmem_free_tensor(ctx, a); + } else { + // move bak to bf16 + // cvk_tl_t index_uint8_t, mask_uint8_t; + // bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &index_uint8_t, tl_buf, CVK_FMT_U8); + // 
bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &mask_uint8_t, tl_ofmap_bf16, CVK_FMT_U8); + + // p10.dst = tl_buf; + // p10.src = &index_uint8_t; + // p10.mv_lut_base = false; + // p10.mv_lut_idx = false; + // ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + + // p10.dst = tl_ofmap_bf16; + // p10.src = &mask_uint8_t; + // ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + + // cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + //_cvm_get_tbl_idx(ctx, tl_ofmap_bf16, tl_buf, CVK_FMT_U8, 0); + } +#endif + + // shift = y0(t) ([0-1] return LUT_d[index], >1 return M_PI_2 - LUT_d[index] + p12.ofmap = out; + p12.ifmap = out; + p12.table = tl_y0_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); + } + +#if 0 + // add pi/-pi for atan2 + cvm_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 0.0); + cvm_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, b); + + // p * z + pi + cvk_tiu_mac_param_t p2; + p2.res_high = 0; + p2.res_low = tl_ofmap_bf16; + p2.res_is_int8 = 0; + p2.a = tl_buf; + p2.b_is_const = 0; + p2.b = tmp; + p2.lshift_bits = 0;//lshift_bits; + p2.rshift_bits = 0;//rshift_bits; + p2.relu_enable = 0; + + ctx->ops->tiu_mac(ctx, &p2); +#else + cvm_emit_mul(ctx, tl_buf, tmp, tl_ofmap_bf16, fmt); + cvm_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, b); +#endif + + return 0; +} + +/** + * \brief using \tl_buf2 as temp buffer for uint8_t add + * \NOTICE: it dirties input: \tl_ifmap + */ +int __cvm_atan_fast_emit(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_invert_buf, + cvk_tl_t* tl_pos_neg_buf, cvk_tl_t* tl_table_answer, + cvk_tl_t* tl_table_answer_mantissa, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t fmt) { + cvm_table_check(tl_ifmap, tl_y0_buf, tl_y0_buf, tl_ifmap); + cvm_table_check(tl_buf, tl_invert_buf, tl_pos_neg_buf, tl_buf); + cvm_table_check(tl_buf, tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16); + + cvk_tiu_lookup_table_param_t p12; + + // plz refer 
https://github.com/xiezhq-hermann/atan_lookup/blob/master/atan.cpp + // for faster version + cvm_emit_abs(ctx, tl_ifmap, tl_buf, fmt); + + // y = 1 / x + _cvm_lut_exp_mantissa(ctx, tl_buf, NULL, tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16, + true); + + // once again cuz recipical's input dirtied + cvm_emit_abs(ctx, tl_ifmap, tl_buf, fmt); + + cvk_tiu_min_param_t p7; + p7.min = tl_buf; + p7.a = tl_buf; + p7.b_is_const = 0; + p7.b = tl_ofmap_bf16; + + ctx->ops->tiu_min(ctx, &p7); + + // get index + cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, 100.0); + + // get index from exp, + // mv_lut_base get exp as index, remove mantissa +#if 1 + cvk_tdma_l2l_tensor_copy_param_t p10; + p10.dst = tl_ofmap_bf16; + p10.src = tl_ifmap; + p10.mv_lut_base = false; + p10.mv_lut_idx = true; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; +#else + cvm_emit_abs(ctx, tl_ifmap, tl_ofmap_bf16, fmt); + cvm_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 0.5); +#endif + + cvk_tl_t* tmp = tl_buf2; + tmp = tl_ifmap; + +#if 0 + // get pos neg, use mv_lut_idx + p10.dst = tmp; + p10.src = tl_ifmap; + p10.mv_lut_base = false; + p10.mv_lut_idx = true; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + // p = table_pos_neg(x0) (>= 0 return 1, < 0 return -1) + p12.ofmap = tmp; + p12.ifmap = tmp; + p12.table = tl_pos_neg_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); + //p12.ofmap = tl_ofmap_bf16; + //p12.ifmap = tmp; + //p12.table = tl_pos_neg_buf; + //ctx->ops->tiu_lookup_table(ctx, &p12); + //return 0; +#else + // dirty input is ok + cvk_tl_t index_i8; + bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, &index_i8, tl_buf2, CVK_FMT_I8); + cvm_emit_mask_ge0_lt0(ctx, tmp, &index_i8, tmp, fmt); + // cvm_emit_mask_ge0_lt0(ctx, tmp, &index_i8, tl_ofmap_bf16, fmt); + // return 0; +#endif + + // get index of LUT[index] or (M_PI_2 - LUT[index]) + { +#if 1 + // invert = table_0_102(x) ([0-1] return 1, >1 return -1) + p12.ofmap = 
tl_ofmap_bf16; + p12.ifmap = tl_ofmap_bf16; + p12.table = tl_invert_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); +#else + { + cvk_tl_t index_i8; + bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, &index_i8, tl_buf2, CVK_FMT_I8); + // 1. abs + // 2. add 0.5 to round bf16->int8 + // 3. leave (0,1) and others, rightshift 1 to get 0, others + // 4. saturate to int max, and transform from int8 to bf16 + + // cvm_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, convert_bf16_fp32(0x7f00)); + cvk_tdma_l2l_tensor_copy_param_t p1; + p1.src = tl_ofmap_bf16; + p1.dst = &index_i8; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + cvk_tiu_mul_param_t p; + p.res_high = NULL; + p.res_low = &index_i8; + p.a = &index_i8; + p.b_is_const = 1; + p.b_const.val = 1; + p.b_const.is_signed = 0; + p.rshift_bits = 1; + p.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p); + + // p.res_high = NULL; + // p.res_low = &index_i8; + // p.a = &index_i8; + // p.b_is_const = 1; + // p.b_const.val =-1; + // p.b_const.is_signed = 1; + // p.rshift_bits = 7; + // p.relu_enable = 0; + + // ctx->ops->tiu_mul(ctx, &p); + + p1.src = &index_i8; + p1.dst = tl_ofmap_bf16; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + return 0; + + // cvm_emit_mask_eq_0(ctx, tl_ofmap_bf16, tl_ofmap_bf16, &index_i8, tl_ofmap_bf16, fmt); + cvm_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 2.0); + cvm_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 1.0); + } +#endif + + cvk_tl_t* out = tl_buf; +#if 0 + // t = 51 * (invert + 1), -> invert + 1 + cvm_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 1.0); + + // t = 51 * (invert + 1) + cvm_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 51.0); + + // t = t + index + cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // get int8 index for lut + //bf16_get_uint8_t_tbl_idx(ctx, tl_ofmap_bf16, tl_buf); + //_cvm_get_tbl_idx(ctx, tl_ofmap_bf16, tl_buf, CVK_FMT_U8, 0); + + //// shift = y0(t) ([0-1] return LUT_d[index], >1 return M_PI_2 - 
LUT_d[index] + //p12.ofmap = tl_buf; + //p12.ifmap = tl_buf; + //p12.table = tl_y0_buf; + //ctx->ops->tiu_lookup_table(ctx, &p12); + + _cvm_get_tbl_idx(ctx, tl_ofmap_bf16, tl_buf, CVK_FMT_U8, 0); + +#else + // index, output is uint8 format + _cvm_get_tbl_idx(ctx, tl_buf, tl_buf, CVK_FMT_U8, 0); + + // mask value from bf16 -> int8, we add as bf16 + // int8 format (51*(mask + 1) + index) is real remap index for table + // mask = mask + 1 + cvm_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 1.0); + + // mask = 51 * mask + cvm_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 51.0); + + // mask value change to int8 format for lut + _cvm_get_tbl_idx(ctx, tl_ofmap_bf16, tl_ofmap_bf16, CVK_FMT_U8, 0); + + // int8 format (51*(mask) + index) is real remap index for table + if (1) { + cvk_tl_t index_uint8_t, mask_uint8_t, fake_uint8_t, out_uint8_t; + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &index_uint8_t, tl_buf, CVK_FMT_U8); + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &out_uint8_t, tl_buf, CVK_FMT_U8); + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &mask_uint8_t, tl_ofmap_bf16, CVK_FMT_U8); + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &fake_uint8_t, tl_buf2, CVK_FMT_U8); +#if 0 + // mask + index + // its safe we only need low part value, so we give fake high part + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = &index_uint8_t; + p4.a_high = &fake_uint8_t; + p4.a_low = &index_uint8_t; + p4.b_is_const = 0; + p4.b.high = &fake_uint8_t; + p4.b.low = &mask_uint8_t; + p4.rshift_bits = 0; + p4.relu_enable = 0; + + ctx->ops->tiu_add(ctx, &p4); +#else + cvk_tiu_mac_param_t p2; + p2.res_high = &fake_uint8_t; + p2.res_low = &index_uint8_t; + p2.res_is_int8 = 0; + p2.a = &mask_uint8_t; + p2.b_is_const = 1; + p2.b = 0; + p2.b_const.val = 1; + p2.b_const.is_signed = 0; + p2.lshift_bits = 0; + p2.rshift_bits = 0; + p2.relu_enable = 0; + + ctx->ops->tiu_mac(ctx, &p2); +#endif + + } else { + // move bak to bf16 + // cvk_tl_t index_uint8_t, mask_uint8_t; + // 
bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &index_uint8_t, tl_buf, CVK_FMT_U8); + // bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &mask_uint8_t, tl_ofmap_bf16, CVK_FMT_U8); + + // p10.dst = tl_buf; + // p10.src = &index_uint8_t; + // p10.mv_lut_base = false; + // p10.mv_lut_idx = false; + // ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + + // p10.dst = tl_ofmap_bf16; + // p10.src = &mask_uint8_t; + // ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + + // cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + //_cvm_get_tbl_idx(ctx, tl_ofmap_bf16, tl_buf, CVK_FMT_U8, 0); + } +#endif + + // shift = y0(t) ([0-1] return LUT_d[index], >1 return M_PI_2 - LUT_d[index] + p12.ofmap = out; + p12.ifmap = out; + p12.table = tl_y0_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); + } + + cvm_emit_mul(ctx, tl_buf, tmp, tl_ofmap_bf16, fmt); + + return 0; +} + +int cvm_atan_fast_emit(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf, cvk_tl_t* tl_buf2, + cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_buf, + cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, bool is_dirty_ifmap) { + return _cvm_atan_fast_emit(ctx, tl_ifmap, tl_buf, tl_buf2, tl_y0_buf, tl_invert_buf, + tl_pos_neg_buf, tl_table_answer, tl_table_answer_mantissa, + tl_ofmap_bf16, fmt, 0.0, is_dirty_ifmap); +} diff --git a/cvimath/src/tiu_lut_atan2.c b/cvimath/src/tiu_lut_atan2.c new file mode 100644 index 000000000..36a6e422f --- /dev/null +++ b/cvimath/src/tiu_lut_atan2.c @@ -0,0 +1,787 @@ +/** + * \brirf implement with atan, plz refer https://en.wikipedia.org/wiki/Atan2 + * NOTICE: current epsilon set to 0.1 + */ +#include +#include "gen_lut.h" // NOLINT + +//#define DBG + +static void _cvm_atan2_emit_case_3(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_buf4, + cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_slope_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table, + 
cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, float b) { + // case 3 + // atan( y / x) + + // x0 = reciprocal(x) + cvm_emit_reciprocal(ctx, x, tl_buf2, tl_table_answer, tl_table_answer_mantissa, tl_buf); + + // y0 = x0 * y + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_buf4; + p1.a = y; + p1.b_is_const = 0; + p1.b = tl_buf; + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + // x0 = atan(y0) + _cvm_atan_emit(ctx, tl_buf4, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, tl_slope_buf, tl_invert_buf, + tl_pos_neg_table, tl_table_answer, tl_table_answer_mantissa, OUT tl_ofmap_bf16, + fmt, b); +} + +static void cvm_atan2_emit_case_3(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_buf4, + cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_slope_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table, + cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + _cvm_atan2_emit_case_3(ctx, y, x, tl_buf, tl_buf2, tl_buf3, tl_buf4, tl_y0_buf, tl_slope_buf, + tl_invert_buf, tl_pos_neg_table, tl_table_answer, tl_table_answer_mantissa, + tl_ofmap_bf16, fmt, 0.0); +} + +// NOTICE: it could dirty \y +/** + * atan2(y, x) should express 4 condition using atan express from + * [here](https://en.wikipedia.org/wiki/Atan2) + */ +void cvm_atan2_emit(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_buf4, cvk_tl_t* tl_buf5, + cvk_tl_t* tl_buf6, cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_slope_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table, cvk_tl_t* tl_table_answer, + cvk_tl_t* tl_table_answer_mantissa, cvk_tl_t* tl_sqrt_table_answer, + cvk_tl_t* tl_sqrt_table_answer_mantissa, cvk_tl_t* tl_0_idx_table, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + cvm_table_check(y, tl_y0_buf, tl_slope_buf, x); + 
cvm_table_check(tl_buf, tl_invert_buf, tl_pos_neg_table, tl_buf2); + cvm_table_check(tl_buf3, tl_table_answer, tl_table_answer_mantissa, tl_buf4); + cvm_table_check(tl_buf6, tl_table_answer, tl_0_idx_table, tl_buf5); + cvm_table_check(y, tl_sqrt_table_answer, tl_sqrt_table_answer_mantissa, x); + + // atan(y/x), x > 0 + // atan(y/x) + PI , x < 0 and y >= 0 + // atan(y/x) - PI , x < 0 and y < 0 + // pi / 2, x = 0 and y > 0 + // -pi / 2, x = 0 and y < 0 + // 0, x = 0 and y = 0 + + // atan(y/x), x > 0 + cvm_emit_max_const(ctx, x, tl_buf4, fmt, 0.0); + cvm_atan2_emit_case_3(ctx, y, tl_buf4, tl_buf, tl_buf2, tl_buf3, tl_buf5, tl_y0_buf, tl_slope_buf, + tl_invert_buf, tl_pos_neg_table, tl_table_answer, tl_table_answer_mantissa, + tl_ofmap_bf16, fmt); + + // x > 0 + cvm_emit_mask_gt0(ctx, x, tl_buf, tl_buf3, tl_buf4, tl_pos_neg_table, tl_0_idx_table, tl_buf2, + fmt); + + cvm_emit_mul(ctx, tl_ofmap_bf16, tl_buf2, tl_ofmap_bf16, fmt); + + // atan(y/x) + PI , x < 0 and y >= 0 + cvm_emit_min_const(ctx, x, tl_buf4, fmt, 0.0); + _cvm_atan2_emit_case_3(ctx, y, tl_buf4, tl_buf, tl_buf2, tl_buf3, tl_buf5, tl_y0_buf, + tl_slope_buf, tl_invert_buf, tl_pos_neg_table, tl_table_answer, + tl_table_answer_mantissa, tl_buf6, fmt, M_PI); + // cvm_emit_add_const(ctx, tl_buf6, tl_buf6, fmt, M_PI); + + // get index map that x < 0 and y >= 0 + // !(y >= 0) = !(y < 0) +#if 0 + cvm_emit_pos_idx(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // y == 0 + cvm_emit_0_idx(ctx, y, tl_buf, tl_0_idx_table, tl_buf2, fmt); + cvm_emit_add(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); +#else + // y >= 0 + cvm_emit_mask_ge0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf2, fmt); +#endif + // x < 0 + cvm_emit_mask_lt0(ctx, x, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // x < 0 && y >= 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + + cvm_emit_mul(ctx, tl_buf6, tl_buf2, tl_buf, fmt); + cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // atan(y/x) - PI , x < 0 and y < 0 + cvm_emit_min_const(ctx, 
x, tl_buf4, fmt, 0.0); + cvm_atan2_emit_case_3(ctx, y, tl_buf4, tl_buf, tl_buf2, tl_buf3, tl_buf5, tl_y0_buf, tl_slope_buf, + tl_invert_buf, tl_pos_neg_table, tl_table_answer, tl_table_answer_mantissa, + tl_buf6, fmt); + cvm_emit_add_const(ctx, tl_buf6, tl_buf6, fmt, -1.0 * M_PI); + // x < 0 and y < 0 + + // we leverage x <= 0 and y <= 0 cuz we filter out x = 0 case, speed up it + // x < 0 + cvm_emit_mask_lt0(ctx, x, tl_buf, tl_pos_neg_table, tl_buf2, fmt); + // y < 0 + cvm_emit_mask_lt0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // x < 0 && y < 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + + cvm_emit_mul(ctx, tl_buf6, tl_buf2, tl_buf, fmt); + cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // pi / 2, x = 0 and y > 0 + // x = 0 + cvm_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1 + + // y > 0 + cvm_emit_mask_gt0(ctx, y, tl_buf, tl_buf5, tl_buf4, tl_pos_neg_table, tl_0_idx_table, tl_buf3, + fmt); + // x = 0 && y > 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, fmt, M_PI / 2.0); + + cvm_emit_add(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // -pi / 2, x = 0 and y < 0 + // x = 0 + cvm_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1 + // y < 0 + cvm_emit_mask_lt0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // x = 0 && y < 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + + cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, fmt, -1.0 * M_PI / 2.0); + + cvm_emit_add(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // 0, x = 0 and y = 0 + // x = 0 + cvm_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1 + // y = 0 + cvm_emit_mask_eq0(ctx, y, tl_buf, tl_0_idx_table, tl_buf3, fmt); // 0.003 could consider 1 + + // x = 0 && y = 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf3, fmt); + + // !(x = 0 and y = 0) keep it + cvm_emit_0_1_revert_input(ctx, tl_buf3, 
tl_buf, tl_buf2, fmt); + cvm_emit_mul(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt); +} + +// ==== fast version === +static void __cvm_atan2_fast_emit_case_3(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, + cvk_tl_t* tl_buf, cvk_tl_t* tl_y0_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table, + cvk_tl_t* tl_table_answer, + cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_tl_t* OUT y_over_x, + cvk_fmt_t fmt, float b) { + // case 3 + // atan( y / x) + +#if 0 + // x0 = reciprocal(x) + _cvm_lut_exp_mantissa(ctx, + x, + NULL, + tl_table_answer, + tl_table_answer_mantissa, + tl_buf, + true + ); + + // y0 = x0 * y + cvm_emit_mul(ctx, y, tl_buf, tl_buf, fmt); +#else + cvm_emit_x_over_y(ctx, y, x, NULL, tl_buf, tl_table_answer, tl_table_answer_mantissa, fmt, true); + + if (y_over_x) { + cvm_emit_add_const(ctx, tl_buf, y_over_x, fmt, 0); + } +#endif + + // x0 = atan(y0) + _cvm_atan_fast_emit(ctx, tl_buf, x, NULL, tl_y0_buf, tl_invert_buf, tl_pos_neg_table, + tl_table_answer, tl_table_answer_mantissa, OUT tl_ofmap_bf16, fmt, b, true); +} + +#if 0 +static void _cvm_atan2_fast_emit(cvk_context_t *ctx, + cvk_tl_t* y, + cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, + cvk_tl_t* tl_buf4, + cvk_tl_t* tl_y0_buf, + cvk_tl_t* tl_invert_buf, + cvk_tl_t* tl_pos_neg_table, + cvk_tl_t* tl_table_answer, + cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, + cvk_tl_t* OUT tl_buf3, + cvk_fmt_t fmt) { + // case 3 + // atan( y / x) + +#if 0 + // x0 = reciprocal(tl_buf) + _cvm_lut_exp_mantissa(ctx, + tl_buf, + NULL, + tl_table_answer, + tl_table_answer_mantissa, + tl_buf2, + true + ); + + // y0 = x0 * y + cvm_emit_mul(ctx, y, tl_buf2, tl_buf2, fmt); +#else +#if 0 + cvm_emit_x_over_y(ctx, y, tl_buf, NULL, tl_buf2, + tl_table_answer, tl_table_answer_mantissa, fmt, true); + + if (tl_buf3) { + bf16_emit_add_const(ctx, tl_buf2, tl_buf3, fmt, 0); + } +#else + //if (tl_buf3) { + // cvm_emit_add_const(ctx, tl_buf, tl_buf3, fmt, 0); + //} + + // get xy == 0 and y < 
0, add pi + // using xy to depend x = 0 or y = 0 + // recipical y < 0 get 0xFEFF, y > 0 get 0x7F7F, + // 1. b = xy to get other/(x = 0 or y = 0) + // 2. c = b * 2^64 to saturate it + // 3. c(bf16) = c(int8) >> 10 to get 1/0 map, 1 indicate xy > 0 + // 4. c = c * -1 + 1 to invert map, 1 indicate x = 0 or y = 0 + // 5. d = b(int8) - 0x7f, 0 means y > 0 + // 6. d = d(int8) + 0xff to get inf + cvm_emit_mul(ctx, y, tl_buf, tl_buf2, fmt); + // get 7f7f / 0 + cvm_emit_mul_const(ctx, tl_buf2, tl_ofmap_bf16, fmt, convert_bf16_fp32(0x7f00)); + //// 1 = 0x3f80 + //bf16_emit_mul_const(ctx, tl_buf2, tl_ofmap_bf16, fmt, 0); + //bf16_emit_add_const(ctx, tl_ofmap_bf16, tl_buf4, fmt, 1.0); + // bf16->uint8_t and back uint8_t->bf16 to get 0/1 map + +#if 1 + cvk_tl_t index_uint8_t; + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &index_uint8_t, tl_buf2, CVK_FMT_U8); + + index_uint8_t.shape.w = index_uint8_t.shape.w / 2; + index_uint8_t.stride = ctx->ops->tl_default_stride(ctx, index_uint8_t.shape, + CTRL_NULL, CVK_FMT_I8); + + index_uint8_t.fmt = CVK_FMT_I8; + + cvk_tdma_l2l_tensor_copy_param_t p1; + p1.src = tl_ofmap_bf16; + p1.dst = &index_uint8_t; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + cvk_tiu_mul_param_t p; + +#if 0 + + p.res_high = NULL; + p.res_low = &index_uint8_t; + p.a = &index_uint8_t; + p.b_is_const = 1; + p.b_const.val =-1; + p.b_const.is_signed = 1; + p.rshift_bits = 0; + p.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p); +#else + p.res_high = NULL; + p.res_low = &index_uint8_t; + p.a = &index_uint8_t; + p.b_is_const = 1; + p.b_const.val =-1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p); + +#endif + + // get -1/0 map, -1 indicate xy != 0 + p1.src = &index_uint8_t; + p1.dst = tl_ofmap_bf16; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + // x * (-1) + 1 get 0/1 map, 1 indicate xy == 0 + //bf16_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, -1.0); + cvm_emit_add_const(ctx, tl_ofmap_bf16, 
tl_ofmap_bf16, fmt, 1.0); + + // get y < 0, bf16->int8 and mul 0xff to get -128 and righshift to get 1 + cvm_emit_mul_const(ctx, y, tl_buf3, fmt, pow(2,64)); + p1.src = tl_buf3; + p1.dst = &index_uint8_t; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + + p.res_high = 0; + p.res_low = &index_uint8_t; + p.a = &index_uint8_t; + p.b_is_const = 1; + p.b_const.val =-128; + p.b_const.is_signed = 1; + p.rshift_bits = 0; + p.relu_enable = 1; + + ctx->ops->tiu_mul(ctx, &p); + + p.res_high = 0; + p.res_low = &index_uint8_t; + p.a = &index_uint8_t; + p.b_is_const = 1; + p.b_const.val =1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 1; + + ctx->ops->tiu_mul(ctx, &p); + + // get y < 0 + p1.src = &index_uint8_t; + p1.dst = tl_buf4; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + cvm_emit_mul_const(ctx, tl_buf4, tl_buf4, fmt, -1.0); + + // get y > 0 + // y * (-1) + 1 get 0/1 map, 1 indicate xy == 0 + cvm_emit_add_const(ctx, tl_buf4, tl_buf2, fmt, 1.0); + cvm_emit_add(ctx, tl_buf2, tl_buf4, tl_buf2, fmt); + + // merge y > 0 && y < 0 && x == 0 + cvm_emit_mul(ctx, tl_ofmap_bf16, tl_buf2, tl_buf3, fmt); + //bf16_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 0); + //bf16_emit_mul_const(ctx, tl_ofmap_bf16, tl_buf3, fmt, M_PI); + +#endif + + + cvm_emit_x_over_y(ctx, y, tl_buf, NULL, tl_buf2, + tl_table_answer, tl_table_answer_mantissa, fmt, true); +#endif +#endif + + // x0 = atan(y0) + __cvm_atan_fast_emit(ctx, + tl_buf2, + tl_buf, + tl_buf4, + tl_y0_buf, + tl_invert_buf, + tl_pos_neg_table, + tl_table_answer, + tl_table_answer_mantissa, + OUT tl_ofmap_bf16, + fmt); + + // abs tl_buf3 + // revert and mul to clean !(x == 0 && (y != 0) case + // add pi/2 + cvm_emit_mul_const(ctx, tl_buf3, tl_buf2, fmt, -1); + cvk_tiu_min_param_t p3; + p3.min = tl_buf2; + p3.a = tl_buf3; + p3.b_is_const = 0; + p3.b = tl_buf2; + + ctx->ops->tiu_min(ctx, &p3); + cvm_emit_add_const(ctx, tl_buf2, tl_buf2, fmt, 1.0); + cvm_emit_mul(ctx, tl_ofmap_bf16, tl_buf2, tl_ofmap_bf16, 
fmt); + + cvm_emit_mul_const(ctx, tl_buf3, tl_buf3, fmt, M_PI_2); + cvm_emit_add(ctx, tl_buf3, tl_ofmap_bf16, tl_ofmap_bf16, fmt); +} +#endif + +static void _cvm_atan2_fast_emit_case_3(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, + cvk_tl_t* tl_buf, cvk_tl_t* tl_y0_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table, + cvk_tl_t* tl_table_answer, + cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, float b) { + // case 3 + // atan( y / x) + return __cvm_atan2_fast_emit_case_3(ctx, y, x, tl_buf, tl_y0_buf, tl_invert_buf, tl_pos_neg_table, + tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16, + NULL, fmt, b); +} + +void cvm_atan2_fast_emit(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_buf4, + cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_slope_buf, cvk_tl_t* tl_invert_buf, + cvk_tl_t* tl_pos_neg_table, cvk_tl_t* tl_table_answer, + cvk_tl_t* tl_table_answer_mantissa, cvk_tl_t* tl_0_idx_table, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + cvm_table_check(y, tl_y0_buf, tl_slope_buf, x); + cvm_table_check(tl_buf, tl_invert_buf, tl_pos_neg_table, tl_buf2); + cvm_table_check(tl_buf3, tl_table_answer, tl_table_answer_mantissa, tl_buf4); + cvm_table_check(tl_buf4, tl_table_answer, tl_0_idx_table, tl_buf4); + + // atan(y/x), x > 0 + // atan(y/x) + PI , x < 0 and y >= 0 + // atan(y/x) - PI , x < 0 and y < 0 + // pi / 2, x = 0 and y > 0 + // -pi / 2, x = 0 and y < 0 + // 0, x = 0 and y = 0 + + // atan(y/x), x > 0 + cvm_emit_max_const(ctx, x, tl_buf, fmt, 0.0); + _cvm_atan2_fast_emit_case_3(ctx, y, tl_buf, tl_buf2, tl_y0_buf, tl_invert_buf, tl_pos_neg_table, + tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16, fmt, 0.0); + + // x > 0 + cvm_emit_mask_gt0(ctx, x, tl_buf, tl_buf2, tl_buf3, tl_pos_neg_table, tl_0_idx_table, tl_buf, + fmt); + + cvm_emit_mul(ctx, tl_ofmap_bf16, tl_buf, tl_ofmap_bf16, fmt); + + // atan(y/x) + PI , x < 0 and y >= 0 + cvm_emit_min_const(ctx, 
x, tl_buf, fmt, 0.0); + _cvm_atan2_fast_emit_case_3(ctx, y, tl_buf, tl_buf2, tl_y0_buf, tl_invert_buf, tl_pos_neg_table, + tl_table_answer, tl_table_answer_mantissa, tl_buf4, fmt, M_PI); + // cvm_emit_add_const(ctx, tl_buf4, tl_buf4, fmt, M_PI); + + // get index map that x < 0 and y >= 0 + // !(y >= 0) = !(y < 0) +#if 0 + cvm_emit_pos_idx(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // y == 0 + cvm_emit_0_idx(ctx, y, tl_buf, tl_0_idx_table, tl_buf2, fmt); + cvm_emit_add(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); +#else + // y >= 0 + cvm_emit_mask_ge0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf2, fmt); +#endif + // x < 0 + cvm_emit_mask_lt0(ctx, x, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // x < 0 && y >= 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + + cvm_emit_mul(ctx, tl_buf4, tl_buf2, tl_buf, fmt); + cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // atan(y/x) - PI , x < 0 and y < 0 + cvm_emit_min_const(ctx, x, tl_buf, fmt, 0.0); + _cvm_atan2_fast_emit_case_3(ctx, y, tl_buf, tl_buf2, tl_y0_buf, tl_invert_buf, tl_pos_neg_table, + tl_table_answer, tl_table_answer_mantissa, tl_buf4, fmt, 0.0); + cvm_emit_add_const(ctx, tl_buf4, tl_buf4, fmt, -1.0 * M_PI); + // x < 0 and y < 0 + + // we leverage x <= 0 and y <= 0 cuz we filter out x = 0 case, speed up it + // x < 0 + cvm_emit_mask_lt0(ctx, x, tl_buf, tl_pos_neg_table, tl_buf2, fmt); + // y < 0 + cvm_emit_mask_lt0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // x < 0 && y < 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + + cvm_emit_mul(ctx, tl_buf4, tl_buf2, tl_buf, fmt); + cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // pi / 2, x = 0 and y > 0 + // x = 0 + cvm_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1 + + // y > 0 + // cvm_emit_mask_gt0(ctx, y, tl_buf, tl_buf5, tl_buf4, tl_pos_neg_table, tl_0_idx_table, tl_buf3, + // fmt); + _cvm_emit_mask(ctx, y, tl_buf, tl_buf4, NULL, tl_pos_neg_table, tl_0_idx_table, 
tl_buf3, fmt, + CVM_MASK_TYPE_GT_0, true); + // x = 0 && y > 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, fmt, M_PI / 2.0); + + cvm_emit_add(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // -pi / 2, x = 0 and y < 0 + // x = 0 + cvm_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1 + // y < 0 + cvm_emit_mask_lt0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // x = 0 && y < 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + + cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, fmt, -1.0 * M_PI / 2.0); + + cvm_emit_add(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // 0, x = 0 and y = 0 + // x = 0 + cvm_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1 + // y = 0 + cvm_emit_mask_eq0(ctx, y, tl_buf, tl_0_idx_table, tl_buf3, fmt); // 0.003 could consider 1 + + // x = 0 && y = 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf3, fmt); + + // !(x = 0 and y = 0) keep it + cvm_emit_0_1_revert_input(ctx, tl_buf3, tl_buf, tl_buf2, fmt); + cvm_emit_mul(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt); +} + +static void _x_lt_0(cvk_context_t* ctx, cvk_tl_t* x, cvk_tl_t* tl_buf, cvk_tl_t* index_i8, + cvk_fmt_t fmt, cvk_tl_t* OUT tl_buf2) { + cvk_tiu_min_param_t p7; + cvk_tiu_mul_param_t p; + cvk_tdma_l2l_tensor_copy_param_t p1; + + // x < 0 + p7.min = tl_buf; + p7.a = x; + p7.b_is_const = 1; + p7.b_const.val = 0; + p7.b_const.is_signed = 1; + + ctx->ops->tiu_min(ctx, &p7); + cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, pow(2, 64)); + + p1.src = tl_buf; + p1.dst = index_i8; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + p.res_high = 0; + p.res_low = index_i8; + p.a = index_i8; + p.b_is_const = 1; + p.b_const.val = -128; + p.b_const.is_signed = 1; + p.rshift_bits = 0; + p.relu_enable = 1; + + ctx->ops->tiu_mul(ctx, &p); + + p.res_high = 0; + p.res_low = index_i8; + p.a = index_i8; + p.b_is_const = 1; + p.b_const.val = 1; + 
p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 1; + + ctx->ops->tiu_mul(ctx, &p); + + // get x < 0 + p1.src = index_i8; + p1.dst = tl_buf2; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); +} + +static void _cvm_atan2_merge_emit(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_y0_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table, + cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, float degree_factor) { + cvm_table_check(y, tl_y0_buf, tl_invert_buf, x); + cvm_table_check(tl_buf, tl_invert_buf, tl_pos_neg_table, tl_buf2); + cvm_table_check(tl_buf3, tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16); + + cvk_tl_t index_i8; + bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, &index_i8, tl_buf2, CVK_FMT_I8); + + /** + * step 1. atan(y/x) + */ + cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, 0.0); + cvm_emit_add(ctx, x, tl_buf, tl_buf, fmt); + +#if 0 + // get y < 0, bf16->int8 and mul 0xff to get -128 and righshift to get 1 + cvk_tiu_mul_param_t p; + cvk_tdma_l2l_tensor_copy_param_t p1; + cvm_emit_mul_const(ctx, y, tl_buf3, fmt, pow(2,64)); + p1.src = tl_buf3; + p1.dst = &index_i8; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + + p.res_high = 0; + p.res_low = &index_i8; + p.a = &index_i8; + p.b_is_const = 1; + p.b_const.val =-128; + p.b_const.is_signed = 1; + p.rshift_bits = 0; + p.relu_enable = 1; + + ctx->ops->tiu_mul(ctx, &p); + + p.res_high = 0; + p.res_low = &index_i8; + p.a = &index_i8; + p.b_is_const = 1; + p.b_const.val =1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 1; + + ctx->ops->tiu_mul(ctx, &p); + + // get y < 0 + p1.src = &index_i8; + p1.dst = tl_buf3; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + cvm_emit_mul_const(ctx, tl_buf3, tl_buf3, fmt, -1.0); + + // get y > 0 + // y * (-1) + 1 get 0/1 map, 1 indicate xy == 0 + cvm_emit_add_const(ctx, tl_buf3, tl_buf2, fmt, 1.0); + 
+ // reduce y == 0 + if (0) + { + cvk_tiu_max_param_t p3; + cvk_tl_t index_i8; + bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, &index_i8, tl_ofmap_bf16, CVK_FMT_I8); + cvm_emit_mul_const(ctx, y, tl_buf, fmt, -1); + p3.max = tl_buf; + p3.a = y; + p3.b_is_const = 0; + p3.b = tl_buf; + + ctx->ops->tiu_max(ctx, &p3); + cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, convert_bf16_fp32(0x7f00)); + //bf16_emit_mul_const(ctx, tl_buf, tl_buf, fmt, pow(2, 64)); + + p1.src = tl_buf; + p1.dst = &index_i8; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + p.res_high = NULL; + p.res_low = &index_i8; + p.a = &index_i8; + p.b_is_const = 1; + p.b_const.val =-1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p); + + + p1.src = &index_i8; + p1.dst = tl_buf3; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + //revert it + cvm_emit_mul_const(ctx, tl_buf3, tl_buf3, fmt, -1.0); + //bf16_emit_add_const(ctx, tl_buf3, tl_buf3, fmt, 1); + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + } + + cvm_emit_add(ctx, tl_buf2, tl_buf3, tl_buf3, fmt); +#endif + + cvm_emit_x_over_y(ctx, y, tl_buf, NULL, tl_buf2, tl_table_answer, tl_table_answer_mantissa, fmt, + true); + + // x0 = atan(y0) + __cvm_atan_fast_emit(ctx, tl_buf2, tl_buf, tl_buf3, tl_y0_buf, tl_invert_buf, tl_pos_neg_table, + tl_table_answer, tl_table_answer_mantissa, OUT tl_ofmap_bf16, fmt); + + bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, &index_i8, tl_buf, CVK_FMT_I8); + + // seperate y >= 0 or < 0 to handle 0 degree / 180 degree + cvm_emit_mask_ge0_lt0(ctx, y, &index_i8, tl_buf3, fmt); + + /** + * step 2. 
set x == 0, y >=0 to pi/2, y < 0 to -pi/2 + * FIXME: atan(0) not eq PI/2 + */ + + // x = 0 and y != 0 + // reset all x = 0 + // y >= 0 as pi/2, y < 0 as -pi/2 + // merge + + cvm_emit_mask_eq_0(ctx, x, tl_buf, &index_i8, tl_buf2, fmt); + + // clear x = 0 + cvm_emit_mul_const(ctx, tl_buf2, tl_buf, fmt, -1); + cvm_emit_mul(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // get revert map, x = -x + 1 cuz original -1 menas x != 0 + cvm_emit_mul_const(ctx, tl_buf3, tl_buf, fmt, M_PI_2 * degree_factor); + cvm_emit_add_const(ctx, tl_buf2, tl_buf2, fmt, 1); + + cvm_emit_mul(ctx, tl_buf, tl_buf2, tl_buf, fmt); + + cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // return; + /** + * step 3. handle x < 0 && y != 0 + */ + + // x < 0 + _x_lt_0(ctx, x, tl_buf, &index_i8, fmt, tl_buf2); + + // x < 0 && (y >= 1 && y < 1) + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf, fmt); + cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, M_PI * degree_factor); + cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + /** + * 4. handle x != 0 && y == 0, x>0: 0, x<0: PI, tpu atan default all pi/2 + */ + // tl_buf2 as x < 0 + // get y == 0, tl_buf3 keep y>=0 is 1, y<1 = -1 + cvm_emit_mask_eq_0(ctx, y, tl_buf, &index_i8, tl_buf3, fmt); + // revert + cvm_emit_mul_const(ctx, tl_buf3, tl_buf, fmt, -1.0); + + // reset y = 0 x = ? as 0, other case leave to step 5 + cvm_emit_mul(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + /** + * 5. set y == 0 and x < 0 as pi + */ + + // get y == 0 + cvm_emit_add_const(ctx, tl_buf3, tl_buf, fmt, 1.0); + // y == 0 && x < 0 + cvm_emit_mul(ctx, tl_buf, tl_buf2, tl_buf, fmt); + cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, M_PI * degree_factor); + + // merge + cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + return; +} + +/** + * \brief reduce lut table with following step + * 1. atan(y/x) + * 2. handle x = 0 && y != 0, directly set pi/2, -pi/2 + * 3. 
handle x < 0 && y != 0 + * => y>0: PI/2, y <0: -PI/2, tpu atan default y>0: -PI/2, y <0: PI/2 + * 4. handle x != 0 && y == 0, x>0: 0, x<0: PI, tpu atan default all pi/2 + * 5. handle x = 0 && y = 0 => PI + */ +void cvm_atan2_merge_emit(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_y0_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table, + cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + return _cvm_atan2_merge_emit(ctx, y, x, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, tl_invert_buf, + tl_pos_neg_table, tl_table_answer, tl_table_answer_mantissa, + tl_ofmap_bf16, fmt, 1.0); +} + +void cvm_atan2_fast_degree_emit(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_y0_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table, + cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + return _cvm_atan2_merge_emit(ctx, y, x, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, tl_invert_buf, + tl_pos_neg_table, tl_table_answer, tl_table_answer_mantissa, + tl_ofmap_bf16, fmt, 180 / M_PI); +} diff --git a/cvimath/src/tiu_reciprocal.c b/cvimath/src/tiu_reciprocal.c new file mode 100644 index 000000000..5cf16154d --- /dev/null +++ b/cvimath/src/tiu_reciprocal.c @@ -0,0 +1,149 @@ +/** + */ +#include +#include "gen_lut.h" // NOLINT + +//#define DBG + +/* + * NOTICE: it could occupy 2 lookup table size which shape is <1,32,32,8> with bf16 data type + * + * \tl_buf tmp buffer, the shape MUST be same with \tl_ifmap + * \tl_ofmap_bf16 result as bf16, MUST given for tmp buffer used + */ +int cvm_emit_reciprocal(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* IN tl_buf, + cvk_tl_t* tbl_answer, cvk_tl_t* tbl_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16) { + return cvm_lut_exp_mantissa(ctx, tl_ifmap, tl_buf, tbl_answer, tbl_answer_mantissa, + 
tl_ofmap_bf16); +} + +// 0, exp from 0 -62 -61 .. 62 63 + for (int i = 0; i < half - 1; i++) { + int shift = (exp_start + i); + bool is_odd = (shift % 2); + float exp = shift; + if (is_odd) { + exp = exp - 1; + } + + double s = _gen_reciprocal(2, exp); + table_data[idx] = convert_fp32_bf16(s); +#ifdef DBG + printf("t [%lu] is %f [idx:%f][2^%f] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), + (float)(exp_start + i), -1 * exp, table_data[idx]); +#endif + idx++; + } + + s = _gen_reciprocal(2, -0); + table_data[idx] = convert_fp32_bf16(s); + table_data[idx] = 0x7F80; //c; i++) { + memcpy(&table_data[i * table_hw], &table_data[0], sizeof(uint16_t) * table_hw); + } +} + +void cvm_gen_reciprocal_mantissa(uint16_t* OUT table_mantissa, cvk_tl_shape_t* table_shape) { + ASSERT(is_1880v2_tbl_shape(table_shape)); + + uint32_t half = half_h_table(); + int table_hw = cvm_table_hw(); + + int idx = 0; + double d; + for (uint32_t i = 0; i < half; i++) { + d = 1 + i * 1 / 128.0; + d = (double)pow(d, -1); + table_mantissa[128 + idx] = convert_fp32_bf16(d); + + // 13=2^3x1.625=(2^2)x(2^1x1.625) + d = 2 * (1 + i * 1 / 128.0); + d = (double)pow(d, -1); + table_mantissa[idx] = convert_fp32_bf16(d); + idx++; + } + +#ifdef DBG + for (uint32_t i = 0; i < 2 * half; i++) { + printf("mantissa [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_mantissa[i]), + table_mantissa[i]); + } +#endif /* ifdef DBG */ + + // duplicate channel #1 to #31 + // TODO: tensor copy + for (uint64_t i = 1; i < table_shape->c; i++) { + memcpy(&table_mantissa[table_hw * i], &table_mantissa[0], sizeof(uint16_t) * table_hw); + } +} + +void cvm_reciprocal_tbl(uint16_t* table_data, uint16_t* table_mantissa, + cvk_tl_shape_t* table_shape) { + ASSERT(table_data); + ASSERT(table_mantissa); + ASSERT(table_shape); + + cvm_gen_reciprocal(table_data, table_shape); + cvm_gen_reciprocal_mantissa(table_mantissa, table_shape); +} diff --git a/cvimath/src/tiu_reshape_c.c b/cvimath/src/tiu_reshape_c.c new file mode 100644 index 
//#define DBG
// copy from \1880v2_test_util.h
// Effective size of one spatial dim after inserting `ins_h` zeros between
// elements, `ins_h_l` trailing insertions, and top/bottom padding.
static int calc_dilute_hw(int h, int ins_h, int ins_h_l, int pad_h_b, int pad_h_t) {
  return (h - 1) * (ins_h + 1) + ins_h_l + 1 + pad_h_t + pad_h_b;
}

// get padding as 'SAME' mode in tensorflow
// https://www.jianshu.com/p/05c4f1621c7e
// TensorFlow defines SAME padding as max(0, (ceil(ih/sh)-1)*sh + kh - ih);
// the unclamped expression can go negative (e.g. ih=5, sh=3, kh=1 -> -1),
// and callers add the result to tensor shapes, so clamp at zero.
static int get_same_pad(int ih, int sh, int kh) {
  int pad = (((ih + sh - 1) / sh) - 1) * sh + kh - ih;
  return pad > 0 ? pad : 0;
}

// get real 'h' with pad/ins: input height extended by insertion and padding
static int pooling_ih_ext(int ins_h, int ins_last_h, int pad_top, int pad_bottom, int ih) {
  int ins = ins_h;
  int ins_last = ins_last_h;
  int pad = pad_top + pad_bottom;
  return (ih - 1) * (ins + 1) + ins_last + 1 + pad;
}

// get real 'w' with pad/ins: input width extended by insertion and padding
static int pooling_iw_ext(int ins_w, int ins_last_w, int pad_left, int pad_right, int iw) {
  int ins = ins_w;
  int ins_last = ins_last_w;
  int pad = pad_left + pad_right;
  return (iw - 1) * (ins + 1) + ins_last + 1 + pad;
}

// get output h with parameter: standard conv/pool output-size formula
// oh = (ih_ext - dilated_kh) / stride_h + 1
static int pooling_oh(int ins_h, int ins_last_h, int pad_top, int pad_bottom, int stride_h, int ih,
                      int kh, int dh) {
  int ih_ext = pooling_ih_ext(ins_h, ins_last_h, pad_top, pad_bottom, ih);
  int d_h = (kh - 1) * dh + 1;
  return (ih_ext - d_h) / stride_h + 1;
}

// get output w with parameter: standard conv/pool output-size formula
static int pooling_ow(int ins_w, int ins_last_w, int pad_left, int pad_right, int stride_w, int iw,
                      int kw, int dw) {
  int iw_ext = pooling_iw_ext(ins_w, ins_last_w, pad_left, pad_right, iw);
  int d_w = (kw - 1) * dw + 1;
  return (iw_ext - d_w) / stride_w + 1;
}
old_bias_c > 0 && ci % old_bias_c == 0); + int sz = fmt == CVK_FMT_BF16 ? 4 : 2; + + int d_c_bias_sz = ni * ci * hi * wi; + uint8_t* new_bias = (uint8_t*)malloc(d_c_bias_sz * sz); + int bias_hw = hi * wi; + int duplicat_c = ci / old_bias_c; + + for (int c = 0; c < old_bias_c; c++) { + int shift = (c * bias_hw) * sz; + for (int i = 0; i < duplicat_c; i++) { + int new_bias_shift = (c * duplicat_c + i) * bias_hw * sz; + memcpy(&new_bias[new_bias_shift], &bias[shift], bias_hw * sz); + } + } + return (uint32_t*)new_bias; +} + +/* + * \brief prepare load shape/stride + * \return -1 means fail to reshape, 0 means success + * \TODO check memory usage + */ +static inline int _get_dup_shape(cvk_context_t* ctx, int in, int ic, int ih, int iw, int d_kh, + int stride_h, int npu_num, cvk_tl_shape_t* tl_shape, + cvk_tl_stride_t* tl_load_stride, cvk_tg_shape_t* tg_shape, + cvk_tg_stride_t* tg_stride, cvk_fmt_t src_tg_fmt, + cvk_fmt_t dst_tl_fmt) { + ASSERT(in > 0 && ic > 0 && ih > 0 && iw > 0 && d_kh > 0 && stride_h > 0); + ASSERT(tl_shape && tl_load_stride && tg_shape && tg_stride); + + // 1. 
reshape and extend c, h axis in order + int ch = ic * ih; + int oc; + int oh; + + // FIXME: check kernel setting + oh = 0; + + for (int i = npu_num / ic; i > 0; i--) { +#if 0 + int hw = ih * iw; + int _oh = hw / i / iw; + if (hw % i == 0 && (hw / i) % stride_h == 0 && _oh >= stride_h) { + oh = _oh; + break; + } +#else + int _oh = ih / i; + if (ih % i == 0 && (_oh) % stride_h == 0 && _oh >= stride_h /*&& _oh >= d_kh*/) { + oh = _oh; + break; + } +#endif + } + + if (!oh) { + // FIXME: check terminal condition + return -1; + } + + oc = ch / oh; + +#ifdef DBG + printf("ic:ih is %d %d, oc:oh is %d:%d\n", ic, ih, oc, oh); +#endif + + // tg/tl MUST be same shape size + tl_shape->n = tg_shape->n = 1; + tl_shape->c = tg_shape->c = oc; + tl_shape->h = tg_shape->h = oh; + tl_shape->w = tg_shape->w = iw; + + // init tl + cvk_tl_stride_t s = ctx->ops->tl_default_stride(ctx, *tl_shape, dst_tl_fmt, CTRL_NULL); + tl_load_stride->n = s.n; + tl_load_stride->c = s.c; + tl_load_stride->h = s.h; + tl_load_stride->w = s.w; + + // init tg + cvk_tg_stride_t gs = ctx->ops->tg_default_stride(ctx, *tg_shape, src_tg_fmt); + + tg_stride->n = gs.n; + tg_stride->c = gs.c; + tg_stride->h = gs.h; + + return 0; +} + +/** + * \brief get proper reshape size for depthwise conv with 'same' mode in h direction + * \return -1 means alloc fail + * \NOTICE: not support batch/ins_x/dilated_x/pad_top/pad_bottom + */ +int cvm_reshape_channel_same(cvk_context_t* ctx, int ic, int ih, int iw, int kh, int kw, + int pad_right, int pad_left, int stride_h, int stride_w, + cvk_tl_shape_t* tl_load_shape, cvk_tl_stride_t* new_tl_ifmap_stride, + cvk_tg_shape_t* new_tg_ifmap_shape, + cvk_tg_stride_t* new_tg_ifmap_stride, + cvk_tl_shape_t* new_tl_weight_shape, cvk_tl_shape_t* new_tl_bias_shape, + cvk_tl_shape_t* new_tl_ofmap_shape, cvk_fmt_t fmt, int eu_align) { + ASSERT(eu_align == 0 || eu_align == 1); + + cvk_chip_info_t info = ctx->info; + // TODO: verify dilation_h/dilation_w + int dilation_h = 1; + int dilation_w = 
1; + // TODO: verify p->ins_h, p->ins_last_h + int d_kh = calc_dilute_hw(kh, dilation_h - 1, 0, 0, 0); + int h_after = calc_dilute_hw(ih, 0, 0, 0, 0); + int in = 1; + // int h_after = calc_dilute_hw(ih, p->ins_h, p->ins_last_h, p->pad_top, p->pad_bottom); + // int w_after = calc_dilute_hw(iw, p->ins_w, p->ins_last_w, p->pad_left, p->pad_right); + int ret = _get_dup_shape(ctx, in, ic, h_after, iw, d_kh, stride_h, info.npu_num, tl_load_shape, + new_tl_ifmap_stride, new_tg_ifmap_shape, new_tg_ifmap_stride, fmt, fmt); + + if (ret == -1) { + return ret; + } + + new_tl_weight_shape->n = 1; + new_tl_weight_shape->c = tl_load_shape->c; + new_tl_weight_shape->h = kh; + new_tl_weight_shape->w = kw; + + new_tl_bias_shape->n = 2; + new_tl_bias_shape->c = tl_load_shape->c; + new_tl_bias_shape->h = 1; + new_tl_bias_shape->w = 1; + + int pad_h = get_same_pad(tl_load_shape->h, stride_h, kh); + // int no_pad_h = tl_load_shape->h; + + // reserve for padding + new_tg_ifmap_shape->h += pad_h; + tl_load_shape->h += pad_h; + + cvk_tl_stride_t s = ctx->ops->tl_default_stride(ctx, *tl_load_shape, fmt, eu_align); + + new_tl_ifmap_stride->n = s.n; + new_tl_ifmap_stride->c = s.c; + new_tl_ifmap_stride->h = s.h; + new_tl_ifmap_stride->w = s.w; + + // TODO: verity ins_x + int oh = pooling_oh(0, 0, 0, 0, stride_h, tl_load_shape->h, kh, dilation_h); + int ow = pooling_ow(0, 0, pad_left, pad_right, stride_w, tl_load_shape->w, kw, dilation_w); + +#ifdef DBG + printf("new oh/ow pad_h is %d/%d %d\n", oh, ow, pad_h); +#endif + new_tl_ofmap_shape->n = in; + new_tl_ofmap_shape->c = tl_load_shape->c; + new_tl_ofmap_shape->h = oh; + new_tl_ofmap_shape->w = ow; + + return ret; +} + +/* + * \brief duplicate weight for reshaped c + */ +uint8_t* cvm_reshape_channel_weight(uint8_t* weight, int ni, int ci, int hi, int wi, + int old_weight_c, cvk_fmt_t fmt) { + ASSERT(weight); + ASSERT(ci / old_weight_c > 0 && ci % old_weight_c == 0); + + int sz = fmt == CVK_FMT_BF16 ? 
2 : 1; + + int new_weight_hw_shape_size = hi * wi; + int new_weight_shape_size = ni * ci * hi * wi; + int duplicat_c = ci / old_weight_c; + uint8_t* new_weight = (uint8_t*)malloc(new_weight_shape_size * sz); + + for (int n = 0; n < ni; n++) { + for (int c = 0; c < old_weight_c; c++) { + int index = (n * old_weight_c + c) * new_weight_hw_shape_size * sz; + for (int i = 0; i < duplicat_c; i++) { + int new_weight_index = + (n * old_weight_c * duplicat_c + c * duplicat_c + i) * new_weight_hw_shape_size * sz; + memcpy(&new_weight[new_weight_index], &weight[index], new_weight_hw_shape_size * sz); + } + } + } + + return new_weight; +} + +/* + * \brief prepare load shape/stride with pad + * \return -1 means fail to reshape, 0 means success + * \TODO check memory usage + */ +static inline int _get_dup_shape_same_pad(cvk_context_t* ctx, int in, int ic, int ih, int iw, + int d_kh, int stride_h, int npu_num, + cvk_tl_shape_t* tl_load_shape, + cvk_tl_stride_t* tl_load_stride, cvk_tg_shape_t* tg_shape, + cvk_tg_stride_t* tg_stride, cvk_fmt_t src_tg_fmt, + cvk_fmt_t dst_tl_fmt) { + ASSERT(in > 0 && ic > 0 && ih > 0 && iw > 0 && d_kh > 0 && stride_h > 0); + ASSERT(tl_load_shape && tl_load_stride && tg_shape && tg_stride); + + // 1. reshape and extend c, h axis in order + int oc; + int oh; + + // FIXME: check kernel setting + oh = 0; + + // 2. get total output + // 3. 
slice output + ASSERT((ih - d_kh) % stride_h == 0); + int ih_ext = pooling_ih_ext(0, 0, 0, 0, ih); + int _oh = (ih_ext - d_kh) / stride_h + 1; + + for (int i = npu_num / ic; i > 0; i--) { + if (_oh % i == 0) { + // add 1 for later padding + oh = stride_h * (_oh / i - 1) + 1; + oc = i * ic; + break; + } + } + + if (!oh) { + // FIXME: check terminal condition + return -1; + } + +#ifdef DBG + printf("ic:ih is %d %d, oc:oh is %d:%d\n", ic, ih, oc, oh); +#endif + + // tg/tl MUST be same shape size + tl_load_shape->n = tg_shape->n = 1; + tl_load_shape->c = tg_shape->c = oc; + tl_load_shape->h = tg_shape->h = oh; + tl_load_shape->w = tg_shape->w = iw; + + // init tl + cvk_tl_stride_t s = ctx->ops->tl_default_stride(ctx, *tl_load_shape, dst_tl_fmt, CTRL_NULL); + tl_load_stride->n = s.n; + tl_load_stride->c = s.c; + tl_load_stride->h = s.h; + tl_load_stride->w = s.w; + + // init tg + cvk_tg_stride_t gs = ctx->ops->tg_default_stride(ctx, *tg_shape, src_tg_fmt); + + tg_stride->n = gs.n; + tg_stride->c = gs.c; + tg_stride->h = gs.h; + + return 0; +} + +/** + * \brief get proper reshape size for depthwise conv with 'same' mode in h direction + * 'pad' means \ih is padded + * \return -1 means alloc fail + * \NOTICE: not support batch/ins_x/dilated_x/pad_top/pad_bottom + */ +int cvm_reshape_channel_same_pad( + cvk_context_t* ctx, int ic, int ih, int iw, int kh, int kw, int pad_right, int pad_left, + int stride_h, int stride_w, cvk_tl_shape_t* tl_load_shape, cvk_tl_stride_t* new_tl_ifmap_stride, + cvk_tg_shape_t* new_tg_ifmap_shape, cvk_tg_stride_t* new_tg_ifmap_stride, + cvk_tl_shape_t* new_tl_weight_shape, cvk_tl_shape_t* new_tl_bias_shape, + cvk_tl_shape_t* new_tl_ofmap_shape, cvk_fmt_t fmt, int eu_align) { + ASSERT(eu_align == 0 || eu_align == 1); + + cvk_chip_info_t info = ctx->info; + // TODO: verify dilation_h/dilation_w + int dilation_h = 1; + int dilation_w = 1; + // TODO: verify p->ins_h, p->ins_last_h + int d_kh = calc_dilute_hw(kh, dilation_h - 1, 0, 0, 0); + int 
h_after = calc_dilute_hw(ih, 0, 0, 0, 0); + int in = 1; + // int h_after = calc_dilute_hw(ih, p->ins_h, p->ins_last_h, p->pad_top, p->pad_bottom); + // int w_after = calc_dilute_hw(iw, p->ins_w, p->ins_last_w, p->pad_left, p->pad_right); + int ret = _get_dup_shape_same_pad(ctx, in, ic, h_after, iw, d_kh, stride_h, info.npu_num, + tl_load_shape, new_tl_ifmap_stride, new_tg_ifmap_shape, + new_tg_ifmap_stride, fmt, fmt); + + if (ret == -1) { + return ret; + } + + new_tl_weight_shape->n = 1; + new_tl_weight_shape->c = tl_load_shape->c; + new_tl_weight_shape->h = kh; + new_tl_weight_shape->w = kw; + + new_tl_bias_shape->n = 2; + new_tl_bias_shape->c = tl_load_shape->c; + new_tl_bias_shape->h = 1; + new_tl_bias_shape->w = 1; + + int pad_h = get_same_pad(tl_load_shape->h, stride_h, kh); + // int no_pad_h = tl_load_shape->h; + + // reserve for padding + new_tg_ifmap_shape->h += pad_h; + tl_load_shape->h += pad_h; + + cvk_tl_stride_t s = ctx->ops->tl_default_stride(ctx, *tl_load_shape, fmt, eu_align); + + new_tl_ifmap_stride->n = s.n; + new_tl_ifmap_stride->c = s.c; + new_tl_ifmap_stride->h = s.h; + new_tl_ifmap_stride->w = s.w; + + // TODO: verity ins_x + int oh = pooling_oh(0, 0, 0, 0, stride_h, tl_load_shape->h, kh, dilation_h); + int ow = pooling_ow(0, 0, pad_left, pad_right, stride_w, tl_load_shape->w, kw, dilation_w); + +#ifdef DBG + printf("new oh/ow pad_h is %d/%d %d\n", oh, ow, pad_h); +#endif + new_tl_ofmap_shape->n = in; + new_tl_ofmap_shape->c = tl_load_shape->c; + new_tl_ofmap_shape->h = oh; + new_tl_ofmap_shape->w = ow; + + return ret; +} diff --git a/cvimath/src/tiu_sigmoid.c b/cvimath/src/tiu_sigmoid.c new file mode 100644 index 000000000..1bd61e99d --- /dev/null +++ b/cvimath/src/tiu_sigmoid.c @@ -0,0 +1,266 @@ +/** + * implement Linear interpolation search + * + * we need to pass 2 table, one is answer(lut_answer), another is slope with + * anwser(lut_answer_slope), + * + * for example, we want to get x value + * +------+----+ + * x0 x x1 + * + * the 
[Linear interpolation defined] (https://en.wikipedia.org/wiki/Linear_interpolation) as + * flowing: + * + * part C part A part B + * +--+ +---+ +----------------------------------------+ + * + * p(x) = f(x0) + ( (f(x1) - f(x0)) / (x1 - x0) ) * (x - x0) + * + * +---+ +-----------------------------+ + * lut_answer lut_answer_slope + */ + +#include +#include "gen_lut.h" // NOLINT + +//#define DBG +/* + * NOTICE: it could occupy 2 lookup table size which shape is <1,32,32,8> with bf16 data type + * + * \tl_buf tmp buffer, the shape MUST be same with \tl_ifmap + * \tl_ofmap_bf16 result as bf16, MUST given for tmp buffer used + */ +int cvm_emit_sigmoid(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* IN tl_buf, + cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_slope, + cvk_tl_t* OUT tl_ofmap_bf16, float scale) { + cvm_table_check(tl_ifmap, tl_table_answer, tl_table_answer_slope, tl_buf); + + cvk_fmt_t fmt = CVK_FMT_BF16; + + cvk_tl_shape_t tl_ofmap_A_idx_int8_shape = {1, tl_buf->shape.c, tl_buf->shape.h * tl_buf->shape.w, + 1}; + + cvk_tdma_l2l_tensor_copy_param_t p10; + + // scale input for remap its idx(-x~x) to (-127~127), dirty tl_ifmap + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_ifmap; + p1.a = tl_ifmap; + p1.b_is_const = 1; + p1.b_const.val = convert_fp32_bf16(scale); + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + // int8 + // save by stride + memset(&p10, 0x00, sizeof(cvk_tdma_l2l_tensor_copy_param_t)); + cvk_tl_t dst; + memcpy(&dst, tl_ofmap_bf16, sizeof(cvk_tl_t)); + dst.fmt = CVK_FMT_I8; + dst.shape = tl_ofmap_A_idx_int8_shape; + // dst.stride = ctx->ops->tl_default_stride(ctx, dst.shape, /*eu_align*/ 1, + // dst.fmt); + dst.stride = ctx->ops->tl_default_stride(ctx, dst.shape, dst.fmt, CTRL_NULL); + dst.stride.h = dst.stride.h * 2; + dst.int8_rnd_mode = 1; + p10.dst = &dst; + p10.src = tl_ifmap; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + dst.int8_rnd_mode = 0; // reset + + // 
/**
 * \brief LUT slots per unit of input: maps the table_hw-entry table onto the
 * input range [range_start, range_end], e.g. 256 / 16 = 16.
 */
float cvm_sigmoid_scale(int range_start, int range_end) {
  // guard the division below; an empty range has no meaningful scale
  ASSERT(range_start != range_end);
  int table_hw = cvm_table_hw();
  return table_hw / (1.0 * abs(range_start - range_end));  // 256 / 16 = 16
}
convert_bf16_fp32(table_slope[i]), s, (float)s, table_slope[i], x1, x0, x1 - x0); +#endif + } + + // duplicate channel #1 to #31 + + // TODO: tensor copy + for (uint64_t i = 1; i < table_shape->c; i++) { + memcpy(&table_slope[table_hw * i], &table_slope[0], sizeof(uint16_t) * table_hw); + } +} + +void cvm_sigmoid_tbl(uint16_t* sigmoid_table_data, uint16_t* sigmoid_table_data_slope, + cvk_tl_shape_t* table_shape, int range_start, int range_end) { + ASSERT(sigmoid_table_data); + ASSERT(sigmoid_table_data_slope); + ASSERT(table_shape); + + double* sigmode_hw = cvm_gen_sigmoid_double(); + + float scale = cvm_sigmoid_scale(range_start, range_end); + + cvm_gen_sigmoid(sigmoid_table_data, table_shape, sigmode_hw, scale, range_start); + + cvm_gen_sigmoid_slope(sigmoid_table_data_slope, table_shape, sigmode_hw, scale, range_start, + range_end); + + cvm_free_sigmoid_double(sigmode_hw); +} diff --git a/cvimath/src/tiu_sqrt.c b/cvimath/src/tiu_sqrt.c new file mode 100644 index 000000000..1977a49ab --- /dev/null +++ b/cvimath/src/tiu_sqrt.c @@ -0,0 +1,121 @@ +/** + */ +#include +#include "gen_lut.h" // NOLINT + +//#define DBG +/* + * NOTICE: it could occupy 2 lookup table size which shape is <1,32,32,8> with bf16 data type + * + * \tl_buf tmp buffer, the shape MUST be same with \tl_ifmap + * \tl_ofmap_bf16 result as bf16, MUST given for tmp buffer used + */ +int cvm_emit_sqrt(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* IN tl_buf, + cvk_tl_t* tbl_answer, cvk_tl_t* tbl_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16) { + return cvm_lut_exp_mantissa(ctx, tl_ifmap, tl_buf, tbl_answer, tbl_answer_mantissa, + tl_ofmap_bf16); +} + +static double _gen_sqrt(int base, int p) { + // y = x ^ 0.5 + double f = (double)(pow(base, p * 0.5)); + + if (isnan(f)) { + ASSERT(0); + } + return f; +} + +void cvm_gen_sqrt(uint16_t* table_data, cvk_tl_shape_t* table_shape) { + ASSERT(is_1880v2_tbl_shape(table_shape)); + + int exp_start = cvm_exp_start(); + int half = half_h_table(); + int 
table_hw = cvm_table_hw(); + uint64_t idx = 0; + + // prepare channel 0 + double s = 0.0; + table_data[idx] = convert_fp32_bf16(s); // 0^0.5 = 0 +#ifdef DBG + printf("t [%lu] is %f(%.8lf)[idx:%f][2^%f] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), s, + (float)exp_start, (float)(exp_start / 2), table_data[idx]); +#endif + idx++; + + // > 0, exp from 0 -62 -61 .. 62 63 + for (int i = 0; i < half; i++) { + int shift = (exp_start + i); + bool is_odd = (shift % 2); + float exp = shift; + if (is_odd) { + exp = exp - 1; + } + + double s = _gen_sqrt(2, exp); + table_data[idx] = convert_fp32_bf16(s); +#ifdef DBG + printf("t [%lu] is %f [idx:%f][2^%f(%f)] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), + (float)(exp_start + i), exp / 2, (exp_start + i) / 2.0, table_data[idx]); +#endif + idx++; + } + + //// idx = 127 dont care + // duplicate channel #1 to #channel + // TODO: tensor copy + + for (uint32_t i = 1; i < table_shape->c; i++) { + memcpy(&table_data[i * table_hw], &table_data[0], sizeof(uint16_t) * table_hw); + } +} + +void cvm_gen_sqrt_mantissa(uint16_t* OUT table_mantissa, cvk_tl_shape_t* table_shape) { + ASSERT(is_1880v2_tbl_shape(table_shape)); + + uint32_t half = half_h_table(); + int table_hw = cvm_table_hw(); + + int idx = 0; + double d; + for (uint32_t i = 0; i < half; i++) { + d = 1 + i * 1 / 128.0; + d = (double)pow(d, 0.5); + table_mantissa[128 + idx] = convert_fp32_bf16(d); +#ifdef DBG + // printf(", [%u] is %lf\n", i+128, d); +#endif /* ifdef DBG */ + + // 13=2^3x1.625=(2^2)x(2^1x1.625) + d = 2 * (1 + i * 1 / 128.0); + d = (double)pow(d, 0.5); + table_mantissa[idx] = convert_fp32_bf16(d); +#ifdef DBG + // printf("mantissa [%u] is %lf", i, d); +#endif /* ifdef DBG */ + idx++; + } +#ifdef DBG + for (uint32_t i = 0; i < 2 * half; i++) { + printf("mantissa [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_mantissa[i]), + table_mantissa[i]); + } +#endif /* ifdef DBG */ + + // duplicate channel #1 to #31 + // TODO: tensor copy + for (uint64_t i = 1; i < 
table_shape->c; i++) { + memcpy(&table_mantissa[table_hw * i], &table_mantissa[0], sizeof(uint16_t) * table_hw); + } +} + +void cvm_sqrt_tbl(uint16_t* sqrt_table_data, uint16_t* sqrt_table_data_mantissa, + cvk_tl_shape_t* table_shape) { + ASSERT(sqrt_table_data); + ASSERT(sqrt_table_data_mantissa); + ASSERT(table_shape); + + cvm_gen_sqrt(sqrt_table_data, table_shape); + cvm_gen_sqrt_mantissa(sqrt_table_data_mantissa, table_shape); +} diff --git a/cvimath/src/tiu_upsample.c b/cvimath/src/tiu_upsample.c new file mode 100644 index 000000000..4d924f02b --- /dev/null +++ b/cvimath/src/tiu_upsample.c @@ -0,0 +1,54 @@ +#include +#include "gen_lut.h" + +int cvm_upsample2d(cvk_context_t* ctx, cvk_tl_t* tl_input, cvk_tl_t* tl_weight, + cvk_tl_t* tl_output) { + int ih = tl_input->shape.h; + int iw = tl_input->shape.w; + int sh = tl_weight->shape.h; + int sw = tl_weight->shape.w; + int kh = sh; + int kw = sw; + + int pt = 0; + int pl = 0; + int pr = 0; + int pb = 0; + int dh = 1; + int dw = 1; + + int ow = tl_output->shape.w; + int oh = tl_output->shape.h; + int kh_ext = (kh - 1) * dh + 1; + int kw_ext = (kw - 1) * dw + 1; + int ins_h = sh - 1; + int ins_w = sw - 1; + int pad_t = kh_ext - pt - 1; + int pad_l = kw_ext - pl - 1; + int pad_b = oh + pb - (ih - 1) * sh - 1; + int pad_r = ow + pr - (iw - 1) * sw - 1; + + cvk_tiu_depthwise_pt_convolution_param_t param = {0}; + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_weight; + param.bias = 0; + param.ins_h = ins_h; + param.ins_last_h = 0; + param.ins_w = ins_w; + param.ins_last_w = 0; + param.stride_h = 1; + param.stride_w = 1; + param.dilation_h = 1; + param.dilation_w = 1; + param.pad_top = pad_t; + param.pad_bottom = pad_b; + param.pad_left = pad_l; + param.pad_right = pad_r; + param.relu_enable = 0; + param.ins_val = 0; // symmetric quantization + param.ins_fp = 0; // symmetric quantization + ctx->ops->tiu_pt_depthwise_convolution(ctx, ¶m); + + return 0; +} diff --git a/cvimath/src/util.c 
b/cvimath/src/util.c new file mode 100644 index 000000000..d19a9cb43 --- /dev/null +++ b/cvimath/src/util.c @@ -0,0 +1,270 @@ +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +#define container_of(ptr, type, member) \ + ({ \ + const typeof(((type *)0)->member) *__mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); \ + }) + +typedef struct { + cvk_tg_t tg; + CVI_RT_MEM mem; +} test_tg_wrapper_t; + +typedef struct { + cvk_mg_t mg; + CVI_RT_MEM mem; +} test_mg_wrapper_t; + +void test_submit_comp(CVI_RT_HANDLE *bm_ctx, cvk_context_t *cvk_ctx) { + (void)cvk_ctx; + (void)bm_ctx; + CVI_RT_Submit(cvk_ctx); +} + +cvk_tg_t *test_alloc_tg_mem_comp(CVI_RT_HANDLE *bm_ctx, cvk_context_t *cvk_ctx, + cvk_tg_shape_t shape, cvk_fmt_t fmt) { + CVI_RT_HANDLE ctx = (CVI_RT_HANDLE)*bm_ctx; + int alloc_sz = tg_shape_size(&shape) * bytesize_of_fmt(fmt); + + test_tg_wrapper_t *w = (test_tg_wrapper_t *)malloc(sizeof(*w)); + assert(w && "Expected allocated tg wrapper"); + + w->tg.base_reg_index = 0; + w->mem = CVI_RT_MemAlloc(ctx, alloc_sz); + w->tg.start_address = CVI_RT_MemGetPAddr(w->mem); + w->tg.fmt = fmt; + w->tg.shape = shape; + w->tg.stride = cvk_ctx->ops->tg_default_stride(cvk_ctx, shape, fmt); + + return &w->tg; +} + +cvk_mg_t *test_alloc_mg_mem_comp(CVI_RT_HANDLE *bm_ctx, cvk_mg_shape_t s, cvk_fmt_t fmt) { + int alloc_sz = mg_shape_size(&s) * bytesize_of_fmt(fmt); + CVI_RT_HANDLE ctx = (CVI_RT_HANDLE)*bm_ctx; + + test_mg_wrapper_t *w = (test_mg_wrapper_t *)malloc(sizeof(*w)); + w->mem = CVI_RT_MemAlloc(ctx, alloc_sz); + + w->mg.base_reg_index = 0; + w->mg.start_address = CVI_RT_MemGetPAddr(w->mem); + w->mg.shape = s; + w->mg.fmt = fmt; + w->mg.stride.row = s.col * bytesize_of_fmt(fmt); + + return &w->mg; +} + +void test_free_tg_mem_comp(CVI_RT_HANDLE *ctx, const cvk_tg_t *tg) { + test_tg_wrapper_t *w = container_of(tg, test_tg_wrapper_t, tg); + CVI_RT_MemFree(*ctx, w->mem); + + free(w); +} + +void 
/**
 * \brief copy host data into a device mg buffer (system-to-device).
 */
void test_put_mg_mem_comp(CVI_RT_HANDLE *ctx, const cvk_mg_t *mg, uint8_t data[]) {
  // Use container_of like every other accessor in this file; the original
  // `(typeof(w))mg` cast only worked because `mg` happens to be the first
  // member of test_mg_wrapper_t, and silently breaks if the struct is
  // reordered.
  test_mg_wrapper_t *w = container_of(mg, test_mg_wrapper_t, mg);
  CVI_RT_MemCopyS2D(*ctx, w->mem, data);
}
2 : 1); + uint8_t *data = (uint8_t *)malloc(size); + assert(data && "Expect allocated data for get mg mem"); + + test_mg_wrapper_t *w = container_of(mg, test_mg_wrapper_t, mg); + CVI_RT_MemCopyD2S(*ctx, data, w->mem); + + return data; +} + +uint8_t *test_get_tensor_l2g_comp(CVI_RT_HANDLE *bm_ctx, cvk_context_t *cvk_ctx, + const cvk_tl_t *tl) { + cvk_tg_shape_t s; + s.n = tl->shape.n; + s.c = tl->shape.h; + s.h = tl->shape.w; + s.w = tl->shape.c; + cvk_tg_t *tg = test_alloc_tg_mem_comp(bm_ctx, cvk_ctx, s, tl->fmt); + + cvk_tdma_l2g_tensor_copy_param_t p; + p.src = tl; + p.dst = tg; + + if (tl->fmt == CVK_FMT_BF16) { + cvk_ctx->ops->tdma_l2g_bf16_tensor_copy(cvk_ctx, &p); + } else { + cvk_ctx->ops->tdma_l2g_tensor_copy(cvk_ctx, &p); + } + test_submit_comp(bm_ctx, cvk_ctx); + uint8_t *data = test_get_tg_mem_comp(bm_ctx, tg); + + test_free_tg_mem_comp(bm_ctx, tg); + return data; +} + +uint8_t *test_get_matrix_l2g_comp(CVI_RT_HANDLE *ctx, cvk_context_t *cvk_ctx, const cvk_ml_t *ml) { + cvk_mg_shape_t s; + s.row = ml->shape.n; + s.col = ml->shape.col; + cvk_mg_t *mg = test_alloc_mg_mem_comp(ctx, s, ml->fmt); + + cvk_tdma_l2g_matrix_copy_param_t p; + p.src = ml; + p.dst = mg; + + if (ml->fmt == CVK_FMT_BF16) { + cvk_ctx->ops->tdma_l2g_bf16_matrix_copy(cvk_ctx, &p); + } else { + cvk_ctx->ops->tdma_l2g_matrix_copy(cvk_ctx, &p); + } + + test_submit_comp(ctx, cvk_ctx); + + uint8_t *data = test_get_mg_mem_comp(ctx, mg); + + test_free_mg_mem_comp(ctx, mg); + + return data; +} + +void test_put_tensor_g2l_comp(CVI_RT_HANDLE *bm_ctx, cvk_context_t *cvk_ctx, const cvk_tl_t *tl, + uint8_t data[]) { + cvk_tg_shape_t tg_shape; + tg_shape.n = tl->shape.n; + tg_shape.c = tl->shape.c; + tg_shape.h = tl->shape.h; + tg_shape.w = tl->shape.w; + + cvk_tg_t *tg = test_alloc_tg_mem_comp(bm_ctx, cvk_ctx, tg_shape, tl->fmt); + + cvk_tdma_g2l_tensor_copy_param_t p; + p.src = tg; + p.dst = tl; + + test_put_tg_mem_comp(bm_ctx, tg, data); + + if (tl->fmt == CVK_FMT_BF16) { + 
#define CNV_SCALAR_C_ALIGN (0x1000)
/**
 * \brief round \p length up to the next multiple of \p align.
 * \param length  value to align (bytes)
 * \param align   alignment, must be non-zero
 * \return smallest multiple of align that is >= length
 */
// `static` added: a plain `inline` definition at file scope provides no
// external definition in C99+, which can fail to link depending on the
// compiler mode; `static inline` is the portable internal-helper form.
static inline uint64_t cnvAlign64(const uint64_t length, const uint64_t align) {
  uint64_t stride = (uint64_t)(length / align) * align;
  if (stride < length) {
    stride += align;
  }
  return stride;
}
uint64_t new_paddr = cnvAlign64(pAddrInfo->phy_addr, CNV_SCALAR_C_ALIGN); + uint64_t offset = new_paddr - pAddrInfo->phy_addr; + pAddrInfo->phy_addr = new_paddr; + pAddrInfo->vir_addr += offset; + + return pAddrInfo->vir_addr; +} + +void test_free_vp_addr(bmctx_t *ctx, AddrInfo *pAddrInfo){ + + bmmem_device_free(*ctx, pAddrInfo->mem); + pAddrInfo->phy_addr = -1; + pAddrInfo->vir_addr = NULL; + //pAddrInfo->size_bytes = 0; + +} diff --git a/cvimath/tests/CMakeLists.txt b/cvimath/tests/CMakeLists.txt new file mode 100644 index 000000000..feacbdcd0 --- /dev/null +++ b/cvimath/tests/CMakeLists.txt @@ -0,0 +1,34 @@ +project(cvimath) + +include(CTest) + +include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include") + +file(GLOB _TEST_UTILS "${CMAKE_CURRENT_SOURCE_DIR}/common/*c") + +# cvi1835 test +include_directories( + ${CMAKE_SOURCE_DIR}/include + ${CMAKE_SOURCE_DIR}/src + ) +file(GLOB CVI1835_TESTS cvi1835/*.cpp) + +# FIXME: repair test case +list(FILTER CVI1835_TESTS EXCLUDE REGEX ".*atan2.*") +list(FILTER CVI1835_TESTS EXCLUDE REGEX ".*depthwise_reshape_same.*") + +foreach(TEST_SRC ${CVI1835_TESTS}) + get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE) + + add_executable(${TEST_NAME} ${_TEST_UTILS} ${TEST_SRC}) + target_link_libraries(${TEST_NAME} ${TPU_KERNEL_LIB} ${TEST_LIBS}) + set_target_properties(${TEST_NAME} PROPERTIES COMPILE_FLAGS "-Werror -Wall -Wextra") + install(TARGETS ${TEST_NAME} DESTINATION bin) + + add_test(${TEST_NAME} ${TEST_NAME} ctest_test) +endforeach() + +#add_library(${PROJECT_NAME} SHARED ${SRC}) +#target_link_libraries(${PROJECT_NAME} ${TPU_KERNEL_LIB}) +#install(TARGETS ${PROJECT_NAME} DESTINATION tests) + diff --git a/cvimath/tests/common/test_native_ref.c b/cvimath/tests/common/test_native_ref.c new file mode 100644 index 000000000..f06db908c --- /dev/null +++ b/cvimath/tests/common/test_native_ref.c @@ -0,0 +1,980 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +#define math_min(x, y) ((x) < (y) ? 
(x) : (y)) +#define math_max(x, y) ((x) > (y) ? (x) : (y)) + +typedef uint8_t uint8_t; +typedef uint16_t uint16_t; +typedef uint32_t uint32_t; +typedef uint64_t uint64_t; + +typedef int8_t int8_t; +typedef int16_t int16_t; +typedef int32_t int32_t; +typedef int64_t s64; +typedef uint32_t bmerr_t; + +#define BM_SUCCESS 0 // The operation was successful +#define BM_ERR_AGAIN 1 // Not ready yet +#define BM_ERR_FAILURE 2 // General failure +#define BM_ERR_TIMEOUT 3 // Timeout +#define BM_ERR_UNINITIALIZED 4 // Uninitialzed +#define BM_ERR_INVALID_ARGUMENT 5 // Arguments invalid +#define BM_ERR_NOMEM 6 // Not enough memory +#define BM_ERR_DATA 7 // Data error +#define BM_ERR_BUSY 8 // Busy +#define BM_ERR_NOT_SUPPORTED 9 // Not supported yet + +typedef uint32_t BLOB_OP; +#define BLOB_ADD 0 +#define BLOB_SUB 1 +#define BLOB_MUL 2 +#define BLOB_DIV 3 +#define BLOB_INVALID 4 + +static inline int calc_offset(int *shape, int *offset) { + return ((offset[0] * shape[1] + offset[1]) * shape[2] + offset[2]) * shape[3] + offset[3]; +} + +static int index_get(int h, int w1, int w2) { return h * w1 + w2; } + +int array_cmp_float_rel(const char *const info, float *p_exp, float *p_got, int count, + float delta) { + int idx = 0; + for (idx = 0; idx < count; idx++) { + if (math_max(fabs(p_exp[idx]), fabs(p_got[idx])) > 1.0) { + // compare rel + if (math_min(fabs(p_exp[idx]), fabs(p_got[idx])) < 1e-20) { + printf("%s rel error at index %d exp %.20f got %.20f\n", info, idx, p_exp[idx], p_got[idx]); + if (isnan(p_exp[idx]) && isnan(p_got[idx])) { + printf("both exp and got are NAN"); + return 0; + } + return -1; + } + if (fabs(p_exp[idx] - p_got[idx]) > delta * math_min(fabs(p_exp[idx]), fabs(p_got[idx]))) { + printf("%s rel error at index %d exp %.20f got %.20f\n", info, idx, p_exp[idx], p_got[idx]); + if (isnan(p_exp[idx]) && isnan(p_got[idx])) { + printf("both exp and got are NAN"); + return 0; + } + return -1; + } + } else { + if (fabs(p_exp[idx] - p_got[idx]) > delta) { + printf("%s 
abs error at index %d exp %.20f got %.20f\n", info, idx, p_exp[idx], p_got[idx]); + if (isnan(p_exp[idx]) && isnan(p_got[idx])) { + printf("both exp and got are NAN"); + return 0; + } + return -1; + } + } + + if (isnan(p_got[idx]) && !isnan(p_exp[idx])) { + printf("%s, found nans idx %d\n", info, idx); + printf("floating from exp %.10f got %.10f\n", p_exp[idx], p_got[idx]); + IF_VAL exp, got; + exp.fval = p_exp[idx]; + got.fval = p_got[idx]; + printf("hex form exp %8.8x got %8.8x\n", exp.ival, got.ival); + return -2; + } + } + return 0; +} + +int array_cmp_float(const char *const info, float *p_exp, float *p_got, int count, float delta) { + if (delta == 0.0f) { + for (int idx = 0; idx < count; idx++) { + if (p_exp[idx] != p_got[idx]) { + printf("%s error at index %d exp %.20f got %.20f\n", info, idx, p_exp[idx], p_got[idx]); + if (isnan(p_exp[idx]) && isnan(p_got[idx])) { + printf("both exp and got are NAN\n"); + return 0; + } + return -1; + } + } + } else { + return array_cmp_float_rel(info, p_exp, p_got, count, delta); + } + return 0; +} + +int array_cmp_int(const char *const info, int *p_exp, int *p_got, int count) { + int idx; + for (idx = 0; idx < count; idx++) { + if (p_exp[idx] != p_got[idx]) { + printf("%s error at index %d exp %d got %d\n", info, idx, p_exp[idx], p_got[idx]); + return -1; + } + } + return 0; +} + +int array_cmp_int8(const char *const info, const int8_t *p_exp, const int8_t *p_got, int count) { + int idx; + for (idx = 0; idx < count; idx++) { + if (p_exp[idx] != p_got[idx]) { + printf("%s error at index %d exp %d got %d\n", info, idx, p_exp[idx], p_got[idx]); + return -1; + } + } + return 0; +} + +int calc_dilute_hw(int h, int ins_h, int ins_h_l, int pad_h_b, int pad_h_t) { + return (h - 1) * (ins_h + 1) + ins_h_l + 1 + pad_h_t + pad_h_b; +} + +int calc_output_hw(int hw, int khw, int stride) { return (hw - khw) / stride + 1; } + +int fill_pad_fmap_int8(const int8_t *before, int8_t **pafter, int val, int pad_l, int pad_r, + int pad_t, int 
pad_b, int ins_h, int ins_w, int ins_h_last, int ins_w_last, + int h_before, int w_before) { + int w_after = (w_before - 1) * (ins_w + 1) + ins_w_last + 1 + pad_l + pad_r; + int h_after = (h_before - 1) * (ins_h + 1) + ins_h_last + 1 + pad_t + pad_b; + int8_t *after = *pafter; + if (!before || !pafter) return BM_ERR_INVALID_ARGUMENT; + + if (!after) { + after = malloc(sizeof(int8_t) * w_after * h_after); + if (!after) return BM_ERR_NOMEM; + } + + memset(after, val, w_after * h_after); + for (int h = 0; h < h_before; h++) { + for (int w = 0; w < w_before; w++) { + int i = (h * (ins_h + 1) + pad_t) * w_after + w * (ins_w + 1) + pad_l; + after[i] = before[h * w_before + w]; + } + } + + *pafter = after; + return BM_SUCCESS; +} + +int fill_pad_fmap_bf16(const uint16_t *before, uint16_t **pafter, int val, int pad_l, int pad_r, + int pad_t, int pad_b, int ins_h, int ins_w, int ins_h_last, int ins_w_last, + int h_before, int w_before) { + int w_after = (w_before - 1) * (ins_w + 1) + ins_w_last + 1 + pad_l + pad_r; + int h_after = (h_before - 1) * (ins_h + 1) + ins_h_last + 1 + pad_t + pad_b; + uint16_t *after = *pafter; + if (!before || !pafter) return BM_ERR_INVALID_ARGUMENT; + if (!after) { + after = malloc(sizeof(uint16_t) * w_after * h_after); + if (!after) return BM_ERR_NOMEM; + } + for (int i = 0; i < w_after * h_after; i++) after[i] = val; + + for (int h = 0; h < h_before; h++) { + for (int w = 0; w < w_before; w++) { + int i = (h * (ins_h + 1) + pad_t) * w_after + w * (ins_w + 1) + pad_l; + after[i] = before[h * w_before + w]; + } + } +#if 0 + printf("bf16 padding:\n"); + for(int i=0;i= 0 && in_y < input_h && in_x >= 0 && in_x < input_w) { + weight_offset[0] = o + o_head; + weight_offset[1] = k; + if (flip) { + weight_offset[2] = (kh - 1 - p); + weight_offset[3] = (kw - 1 - q); + } else { + weight_offset[2] = p; + weight_offset[3] = q; + } + in_offset[0] = n; + in_offset[1] = k + k_head; + in_offset[2] = in_y; + in_offset[3] = in_x; + ofmap_f[calc_offset(o_shape, 
out_offset)] += + ifmap_f[calc_offset(i_shape, in_offset)] * + weight_f[calc_offset(k_shape, weight_offset)]; + if (k_g == 1 && kh == 1 && kw == 1) { + ofmap_f[calc_offset(o_shape, out_offset)] = + ifmap_f[calc_offset(i_shape, in_offset)] * + weight_f[calc_offset(k_shape, weight_offset)]; + } + } + } + } + } + if (using_bias) { + ofmap_f[calc_offset(o_shape, out_offset)] += bias_f[o + o_head]; + } + if (result_add) { + ofmap_f[calc_offset(o_shape, out_offset)] += result_init; + } + } + } + } + } + } +} + +int native_fc_int8(const int8_t *L, const int8_t *R, const int16_t *B, const int16_t *Y, int *Y_ref, + int L_row_num, int L_col_num, int R_col_num, int L_sign, int R_sign, int B_sign, + int l_shift_width, int r_shift_width, int is_result_int8, int do_relu) { + const uint8_t *uL = (const uint8_t *)L; + const uint8_t *uR = (const uint8_t *)R; + const uint16_t *uB = (const uint16_t *)B; + + int opd0, opd1, opd2; + int ret = BM_SUCCESS; + + for (int hidx = 0; hidx < L_row_num; hidx++) { + for (int widx = 0; widx < R_col_num; widx++) { + int Y1 = 0; + int Y2 = 0; + int sum_idx = 0; + for (sum_idx = 0; sum_idx < L_col_num; sum_idx++) { + int idx_L = index_get(hidx, L_col_num, sum_idx); + int idx_R = index_get(sum_idx, R_col_num, widx); + opd0 = (L_sign) ? L[idx_L] : uL[idx_L]; + opd1 = (R_sign) ? R[idx_R] : uR[idx_R]; + if ((sum_idx % 2 == 0 && sum_idx != 2) || sum_idx == 1) { + Y1 += opd0 * opd1; + } else { + Y2 += opd0 * opd1; + } + } + sum_idx++; + + if (B) { + opd2 = (B_sign) ? 
(int)B[widx] : (int)uB[widx]; + if ((sum_idx % 2 == 0 && sum_idx != 2) || sum_idx == 1) { + Y1 += opd2; + } else { + Y2 += opd2; + } + sum_idx++; + } + + int idx_Y = index_get(hidx, R_col_num, widx); + if (Y) { + if ((sum_idx % 2 == 0 && sum_idx != 2) || sum_idx == 1) { + Y1 += (Y[idx_Y] << l_shift_width); + } else { + Y2 += (Y[idx_Y] << l_shift_width); + } + } + + Y_ref[idx_Y] = Y1 + Y2; + } + } + uint8_t *Yout_int8 = malloc(sizeof(int8_t) * L_row_num * R_col_num); + uint16_t *Yout_int16 = malloc(sizeof(int16_t) * L_row_num * R_col_num); + + if (is_result_int8) { + ret = + satu_2_8bit(Y_ref, L_row_num * R_col_num, (int8_t *)Yout_int8, r_shift_width, 1, !do_relu); + if (ret != BM_SUCCESS) goto error_release; + + fill_int_with_int8(Y_ref, (int8_t *)Yout_int8, L_row_num * R_col_num); + } else { + ret = satu_2_16bit(Y_ref, L_row_num * R_col_num, (int16_t *)Yout_int16, r_shift_width, 1, + !do_relu); + if (ret != BM_SUCCESS) goto error_release; + + fill_int_with_int16(Y_ref, (int16_t *)Yout_int16, L_row_num * R_col_num); + } + +error_release: + free(Yout_int8); + free(Yout_int16); + + return ret; +} + +int native_pooling_ave_int8(const int8_t *i_fmap, const void *weight, const int16_t *bias, + int8_t *o_fmap, int input_n, int input_c, int input_h, int input_w, + int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, int ins_h, int ins_w, int ins_h_last, + int ins_w_last, int input_sign, int satu_sign, int r_shift_width, + int const_weight) { + if (kh * kw <= 0) return BM_ERR_INVALID_ARGUMENT; + + int *avg_pooling_mac_a = (int *)malloc(kh * kw * sizeof(int)); + int *avg_pooling_mac_b = (int *)malloc(kh * kw * sizeof(int)); + + uint8_t avg_const_weight = *(uint8_t *)weight; + const int8_t *weight_arr = weight; + + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + + int output_h = calc_output_hw(h_after, kh, stride_h); + int 
output_w = calc_output_hw(w_after, kw, stride_w); + + int8_t *i_fmap_pad = NULL; + for (int n = 0; n < input_n; n++) { + if (const_weight == 0) weight_arr = weight; + + for (int c = 0; c < input_c; ++c) { + fill_pad_fmap_int8(i_fmap, &i_fmap_pad, 0, pad_w_l, pad_w_r, pad_h_t, pad_h_b, ins_h, ins_w, + ins_h_last, ins_w_last, input_h, input_w); + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + int mac_index = 0; + int avg_pool_result; + + for (int h = 0; h < kh; h++) { + for (int w = 0; w < kw; w++) { + int index = index_get((hstart + h), w_after, (w + wstart)); + mac_index = index_get(h, kw, w); + avg_pooling_mac_a[mac_index] = + input_sign ? i_fmap_pad[index] : (uint8_t)(i_fmap_pad[index]); + + avg_pooling_mac_b[mac_index] = + const_weight ? avg_const_weight : weight_arr[mac_index]; + } + } + + inner_product(avg_pooling_mac_a, avg_pooling_mac_b, kh * kw, &avg_pool_result); + + if (bias) { + avg_pool_result += bias[c]; + } + + int ret = satu_2_8bit(&avg_pool_result, sizeof(int8_t), o_fmap + pool_index, + r_shift_width, 1, satu_sign); + + if (ret != BM_SUCCESS) { + free(i_fmap_pad); + free(avg_pooling_mac_a); + free(avg_pooling_mac_b); + + return BM_ERR_INVALID_ARGUMENT; + } + } + } + i_fmap += input_w * input_h; + if (const_weight == 0) weight_arr += kh * kw; + + o_fmap += output_w * output_h; + } + } + free(i_fmap_pad); + + free(avg_pooling_mac_a); + free(avg_pooling_mac_b); + + return BM_SUCCESS; +} + +int native_pooling_max_int8(const int8_t *i_fmap, int8_t *o_fmap, int input_n, int input_c, + int input_h, int input_w, int kh, int kw, int pad_h_t, int pad_h_b, + int pad_w_l, int pad_w_r, int stride_h, int stride_w, int ins_h, + int ins_w, int ins_h_last, int ins_w_last, int input_sign) { + if (ins_h != 0 || ins_w != 0 || ins_h_last != 0 || ins_w_last != 0) + return BM_ERR_INVALID_ARGUMENT; + + int h_after = 
calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + + const int max_init = input_sign ? -128 : 0; + int8_t *i_fmap_pad = NULL; + for (int nc = 0; nc < input_n * input_c; nc++) { + fill_pad_fmap_int8(i_fmap, &i_fmap_pad, max_init, pad_w_l, pad_w_r, pad_h_t, pad_h_b, 0, 0, 0, + 0, input_h, input_w); + + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + int max = max_init; + for (int h = 0; h < kh; h++) { + for (int w = 0; w < kw; w++) { + int index = index_get((hstart + h), (input_w + pad_w_l + pad_w_r), (w + wstart)); + int val = input_sign ? i_fmap_pad[index] : (uint8_t)i_fmap_pad[index]; + max = (val > max) ? val : max; + } + } + o_fmap[pool_index] = max; + } + } + i_fmap += input_w * input_h; + o_fmap += output_w * output_h; + } + free(i_fmap_pad); + + return BM_SUCCESS; +} + +int native_pooling_max_fp32(const float *ifmap, float *ofmap, int input_n, int input_c, int input_h, + int input_w, int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l, + int pad_w_r, int stride_h, int stride_w, int ins_h, int ins_w, + int ins_h_last, int ins_w_last) { + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + float *ifmap_after = malloc(sizeof(float) * h_after * w_after); + + if (ifmap_after == NULL) { + printf("No enough memory[h_after, w_after]: [%u, %u].\n", h_after, w_after); + return BM_ERR_NOMEM; + } + + for (int n = 0; n < input_n; n++) { + for (int c = 0; c < input_c; c++) { + int ret = fill_pad_fmap_fp32(ifmap, 
&ifmap_after, -FLT_MAX, pad_h_t, pad_h_b, pad_w_l, + pad_w_r, ins_h, ins_w, ins_h_last, ins_w_last, input_h, input_w); + + if (ret != BM_SUCCESS) { + printf("Failed to pad input fmap.\n"); + free(ifmap_after); + return BM_ERR_FAILURE; + } + + for (int h = 0; h < output_h; h++) { + for (int w = 0; w < output_w; w++) { + int rf_h = h * stride_h, rf_w = w * stride_w; + int kh_end = math_min(kh, h_after - rf_h); + int kw_end = math_min(kw, w_after - rf_w); + float *rf_addr = ifmap_after + rf_h * w_after + rf_w; + float max_val = -FLT_MAX; + + for (int i = 0; i < kh_end; i++) { + for (int j = 0; j < kw_end; j++) { + max_val = math_max(rf_addr[i * w_after + j], max_val); + } + } + ofmap[h * output_w + w] = max_val; + } + } + + ifmap += input_h * input_w; + ofmap += output_h * output_w; + } + } + + free(ifmap_after); + return BM_SUCCESS; +} + +int native_pooling_avg_fp32(const float *ifmap, float *ofmap, int input_n, int input_c, int input_h, + int input_w, int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l, + int pad_w_r, int stride_h, int stride_w, int ins_h, int ins_w, + int ins_h_last, int ins_w_last, float avg_pooling_const) { + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + float *ifmap_after = malloc(sizeof(float) * h_after * w_after); + + if (ifmap_after == NULL) { + printf("No enough memory[h_after, w_after]: [%u, %u].\n", h_after, w_after); + return BM_ERR_NOMEM; + } + + for (int n = 0; n < input_n; n++) { + for (int c = 0; c < input_c; c++) { + int ret = fill_pad_fmap_fp32(ifmap, &ifmap_after, 0, pad_h_t, pad_h_b, pad_w_l, pad_w_r, + ins_h, ins_w, ins_h_last, ins_w_last, input_h, input_w); + + if (ret != BM_SUCCESS) { + printf("Failed to pad input fmap.\n"); + free(ifmap_after); + return BM_ERR_FAILURE; + } + + for (int h = 0; h < output_h; 
h++) { + for (int w = 0; w < output_w; w++) { + int rf_h = h * stride_h, rf_w = w * stride_w; + int kh_end = math_min(kh, h_after - rf_h); + int kw_end = math_min(kw, w_after - rf_w); + float *rf_addr = ifmap_after + rf_h * w_after + rf_w; + float dot_product_even = 0.0, dot_product_odd = 0.0; + + for (int i = 0; i < kh_end; i++) { + for (int j = 0; j < kw_end; j++) { + if ((i * kw_end + j) % 2) { + dot_product_odd += rf_addr[i * w_after + j] * avg_pooling_const; + } else { + dot_product_even += rf_addr[i * w_after + j] * avg_pooling_const; + } + } + } + ofmap[h * output_w + w] = dot_product_even + dot_product_odd; + } + } + + ifmap += input_h * input_w; + ofmap += output_h * output_w; + } + } + + free(ifmap_after); + return BM_SUCCESS; +} + +void native_pooling_forward_max(const float *bottom_data, float *top_data, int *mask_data, + const int count, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, + const int pad_w) { + (void)num; + for (int index = 0; index < count; ++index) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = math_min(hstart + kernel_h, height); + const int wend = math_min(wstart + kernel_w, width); + hstart = math_max(hstart, 0); + wstart = math_max(wstart, 0); + float maxval = -FLT_MAX; + int maxidx = -1; + const float *const bottom_slice = bottom_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (bottom_slice[h * width + w] > maxval) { + maxidx = h * width + w; + maxval = bottom_slice[maxidx]; + } + } + } + top_data[index] = 
maxval; + mask_data[index] = maxidx; + } +} + +void native_pooling_forward_ave(const float *bottom_data, float *top_data, const int count, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w) { + (void)num; + for (int index = 0; index < count; ++index) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = math_min(hstart + kernel_h, height + pad_h); + int wend = math_min(wstart + kernel_w, width + pad_w); + const int pool_size = (hend - hstart) * (wend - wstart); + hstart = math_max(hstart, 0); + wstart = math_max(wstart, 0); + hend = math_min(hend, height); + wend = math_min(wend, width); + float aveval = 0; + const float *const bottom_slice = bottom_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + aveval += bottom_slice[h * width + w]; + } + } + top_data[index] = aveval / pool_size; + } +} + +int satu_2_8bit(const int *pBuff, int len, int8_t *pByteOut, int rshiftbits, int round_floor, + int sign_unsign) { + if (rshiftbits < 0) return BM_ERR_INVALID_ARGUMENT; + + int temp; + int satu_max = sign_unsign ? 127 : 255; + int satu_min = sign_unsign ? -128 : 0; + if (rshiftbits == 0) { + for (int ii = 0; ii < len; ii++) { + temp = (pBuff[ii] > satu_max) ? satu_max : ((pBuff[ii] < satu_min) ? 
satu_min : pBuff[ii]); + memcpy(pByteOut + ii, &temp, 1); + } + } else { // rshiftbits>0 + for (int ii = 0; ii < len; ii++) { + if (round_floor == 1) + temp = ((pBuff[ii] >> (rshiftbits - 1)) + 1) >> 1; + else + temp = pBuff[ii] >> rshiftbits; + temp = (temp > satu_max) ? satu_max : ((temp < satu_min) ? satu_min : temp); + memcpy(pByteOut + ii, &temp, 1); + } + } + + return BM_SUCCESS; +} + +int satu_2_16bit(const int *pBuff, int len, short *pByteOut, int rshiftbits, int round_floor, + int sign_unsign) { + if (rshiftbits < 0) return BM_ERR_INVALID_ARGUMENT; + + int ii; + int temp; + int satu_max = sign_unsign ? 32767 : 65535; + int satu_min = sign_unsign ? -32768 : 0; + if (rshiftbits == 0) { + for (ii = 0; ii < len; ii++) { + temp = (pBuff[ii] > satu_max) ? satu_max : ((pBuff[ii] < satu_min) ? satu_min : pBuff[ii]); + memcpy(pByteOut + ii, &temp, 2); + } + } else { // rshiftbits>0 + for (ii = 0; ii < len; ii++) { + if (round_floor == 1) + temp = ((pBuff[ii] >> (rshiftbits - 1)) + 1) >> 1; + else + temp = pBuff[ii] >> rshiftbits; + temp = (temp > satu_max) ? satu_max : ((temp < satu_min) ? 
satu_min : temp); + memcpy(pByteOut + ii, &temp, 2); + } + } + + return BM_SUCCESS; +} diff --git a/cvimath/tests/cvi1835/atan.cpp b/cvimath/tests/cvi1835/atan.cpp new file mode 100644 index 000000000..275c6170a --- /dev/null +++ b/cvimath/tests/cvi1835/atan.cpp @@ -0,0 +1,477 @@ +/** + * plz refer [git](https://github.com/xiezhq-hermann/atan_lookup) + * input range is `all real numbers` and output range is -pi/2 < x < pi/2, + * you can refer [here](https://www.mathopenref.com/arctan.html) for more + * details + */ +// +// xiezhq@shanghaitech.edu.cn && wanghe@shanghaitech.edu.cn +/* Reference: + [1] Abhisek Ukil, Vishal H Shah, Bernhard Deck, + "Fast Computation of arctangent Functions for Embedded Applications: A + Comparative Analysis" IEEE International Symposium on Industrial Electronics, + Pages: 1206 - 1211, DOI: 10.1109/ISIE.2011.5984330, 2011 + [2] Sreeraman Rajan, Sichun Wang, Robert Inkol, and Alain Joyal + "Efficient Approximations for the Arctangent Function" + IEEE SIGNAL PROCESSING MAGAZINE [108] MAY 2006 + */ + +#include +#include + +#define OUT +#define IN +#include +#include +#include +#include +#include +#include +//#define DBG + +using namespace std; + +#if 0 +double atan_double(double x) { + /* + More precise look-up table is used for higher accuracy + */ + if (x >= 0) { + if (x <= 1) { + int index = round(x * 100); + return (LUT_d[index] + (x * 100 - index) * (LUT_d[index + 1] - LUT_d[index])); + } else { + double re_x = 1 / x; + int index = round(re_x * 100); + return (M_PI_2 - (LUT_d[index] + (re_x * 100 - index) * (LUT_d[index + 1] - LUT_d[index]))); + // No recursive is better here + } + } else { + if (x >= -1) { + double abs_x = -x; + int index = round(abs_x * 100); + return -(LUT_d[index] + (abs_x * 100 - index) * (LUT_d[index + 1] - LUT_d[index])); + } else { + double re_x = 1 / (-x); + int index = round(re_x * 100); + return (LUT_d[index] + (re_x * 100 - index) * (LUT_d[index+1] - LUT_d[index])) - M_PI_2; + } + } +} +#endif + +/** + * 
pre_data means we test fixed pattern, it should be same sa lut + */ +enum TEST_MODE { + PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare + DATA_COMPARE_ACCURACY, // generate \range_start to \range_end value that + // check epsilon + DATA_COMPARE_U8, // generate \range_start to \range_end value that check + // epsilon, result bf16->uint8_t + TEST_MODE_MAX, +}; + +static TEST_MODE mode; + +static uint16_t test_pattern[] = { + 0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52, 0x3A6C, 0x3A83, 0x3A90, + 0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF, 0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17, + 0x3B1D, 0x3B24, 0x3B2A, 0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65, + 0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90, 0x3B93, 0x3B97, 0x3B9A, + 0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE, 0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1, + 0x3BC5, 0x3BC8, 0x3BCB, 0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9, + 0x3BEC, 0x3BEF, 0x3BF2, 0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03, 0x3C05, 0x3C06, 0x3C08, + 0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12, 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C, + 0x3C1D, 0x3C1F, 0x3C21, 0x3C22, 0x3C24, 0x3C25, 0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F, + 0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E, 0x3C40, 0x3C41, 0x3C43, + 0x3C45, 0x3C46, 0x3C48, 0x3C4A, 0x3C4B, 0x3C4D, 0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57, + 0x3C58, 0x3C5A, 0x3C5C, 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A, + 0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, 0x3C7B, 0x3C7C, 0x3C7E, + 0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89, + 0x3C8A, 0x3C8A, 0x3C8B, 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93, + 0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, 0x3C9B, 0x3C9C, 0x3C9C, + 
0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6, + 0x3CA7, 0x3CA8, 0x3CA9, 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0, + 0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, 0x3CB8, 0x3CB9, 0x3CBA, + 0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4, + 0x3CC5, 0x3CC5, 0x3CC6, 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE, + 0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, 0x3CD6, 0x3CD7, 0x3CD7, + 0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1, + 0x3CE2, 0x3CE3, 0x3CE4, 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB, + 0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, 0x3CF3, 0x3CF4, 0x3CF5, + 0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF, + 0x3D00, 0x3D00, 0x3D01, 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 0x3D03, 0x3D03, 0x3D04, 0x3D04, + 0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, 0x3D08, 0x3D09, 0x3D09, + 0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 0x3D0E, + 0x3D0F, 0x3D0F, 0x3D0F, 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13, + 0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 0x3D16, 0x3D17, 0x3D17, 0x3D18, 0x3D18, + 0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D, + 0x3D1D, 0x3D1E, 0x3D1E, 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22, + 0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, 0x3D26, 0x3D26, 0x3D27, + 0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C, + 0x3D2C, 0x3D2C, 0x3D2D, 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31, + 0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 
0x3D33, 0x3D33, 0x3D34, 0x3D34, 0x3D35, 0x3D35, 0x3D35, + 0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A, + 0x3D3B, 0x3D3B, 0x3D3C, 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F, + 0x3D40, 0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, 0x3D43, 0x3D44, 0x3D44, + 0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49, + 0x3D4A, 0x3D4A, 0x3D4A, 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E, + 0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, 0x3D52, 0x3D53, 0x3D53, + 0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58, + 0x3D58, 0x3D59, 0x3D59, 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D, + 0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, 0x3D61, 0x3D61, 0x3D62, + 0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67, + 0x3D67, 0x3D67, 0x3D68, 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C, + 0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, 0x3D70, 0x3D70, 0x3D70, + 0x3D71, 0x3D71, 0x3D72, 0x3D72, 0x3D72, 0x3D73, 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75, + 0x3D76, 0x3D76, 0x3D77, 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A, + 0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, 0x3D7E, 0x3D7F, 0x3D7F, + 0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82, + 0x3D82, 0x3D82, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85, + 0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D87, 0x3D87, 0x3D87, + 0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89, + 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 
0x3D8C, 0x3D8C, + 0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, + 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91, + 0x3D91, 0x3D91, 0x3D91, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93, + 0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, 0x3D95, 0x3D96, 0x3D96, + 0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98, + 0x3D98, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B, + 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9D, 0x3D9D, 0x3D9D, + 0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0, + 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2, + 0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, + 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7, + 0x3DA7, 0x3DA7, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, + 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAC, 0x3DAC, + 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, + 0x3DAE, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1, + 0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3, + 0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6, + 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8, + 0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBB, + 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD, + 0x3DBD, 0x3DBD, 
0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, + 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC2, 0x3DC2, 0x3DC2, + 0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, + 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7, + 0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, 0x3DC9, 0x3DC9, 0x3DC9, + 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC, + 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5, + 0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, 0x3DDF, 0x3DE0, 0x3DE1, + 0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5, +}; + +static uint16_t golden_bf16[] = { + 0x0, 0x38d2, 0x3952, 0x399d, 0x39d2, 0x3a03, 0x3a1d, 0x3a38, 0x3a52, 0x3a6c, 0x3a83, 0x3a90, + 0x3a9d, 0x3aaa, 0x3ab8, 0x3ac5, 0x3ad2, 0x3adf, 0x3aec, 0x3afa, 0x3b03, 0x3b0a, 0x3b10, 0x3b17, + 0x3b1d, 0x3b24, 0x3b2a, 0x3b31, 0x3b38, 0x3b3e, 0x3b45, 0x3b4c, 0x3b52, 0x3b59, 0x3b5f, 0x3b65, + 0x3b6c, 0x3b72, 0x3b7a, 0x3b80, 0x3b83, 0x3b86, 0x3b8a, 0x3b8d, 0x3b90, 0x3b93, 0x3b97, 0x3b9a, + 0x3b9d, 0x3ba1, 0x3ba4, 0x3ba7, 0x3baa, 0x3bae, 0x3bb1, 0x3bb4, 0x3bb8, 0x3bbb, 0x3bbe, 0x3bc1, + 0x3bc5, 0x3bc8, 0x3bcb, 0x3bce, 0x3bd2, 0x3bd6, 0x3bd8, 0x3bdc, 0x3bdf, 0x3be2, 0x3be6, 0x3be9, + 0x3bec, 0x3bef, 0x3bf2, 0x3bf6, 0x3bf9, 0x3bfc, 0x3c00, 0x3c01, 0x3c03, 0x3c05, 0x3c06, 0x3c08, + 0x3c0a, 0x3c0b, 0x3c0d, 0x3c0f, 0x3c10, 0x3c12, 0x3c13, 0x3c15, 0x3c17, 0x3c18, 0x3c1a, 0x3c1c, + 0x3c1d, 0x3c1f, 0x3c21, 0x3c22, 0x3c24, 0x3c25, 0x3c27, 0x3c29, 0x3c2a, 0x3c2c, 0x3c2e, 0x3c2f, + 0x3c31, 0x3c33, 0x3c34, 0x3c36, 0x3c38, 0x3c39, 0x3c3b, 0x3c3c, 0x3c3e, 0x3c40, 0x3c41, 0x3c43, + 0x3c45, 0x3c46, 0x3c48, 0x3c4a, 0x3c4b, 0x3c4d, 0x3c4e, 0x3c50, 0x3c52, 0x3c53, 0x3c55, 0x3c57, + 0x3c58, 0x3c5a, 0x3c5c, 0x3c5d, 0x3c5f, 0x3c60, 0x3c62, 0x3c64, 0x3c66, 0x3c68, 
0x3c69, 0x3c6a, + 0x3c6c, 0x3c6e, 0x3c70, 0x3c71, 0x3c72, 0x3c74, 0x3c76, 0x3c78, 0x3c79, 0x3c7b, 0x3c7c, 0x3c7e, + 0x3c80, 0x3c81, 0x3c81, 0x3c82, 0x3c83, 0x3c84, 0x3c85, 0x3c86, 0x3c86, 0x3c87, 0x3c88, 0x3c89, + 0x3c8a, 0x3c8a, 0x3c8b, 0x3c8c, 0x3c8d, 0x3c8e, 0x3c8f, 0x3c8f, 0x3c90, 0x3c91, 0x3c92, 0x3c93, + 0x3c93, 0x3c94, 0x3c95, 0x3c96, 0x3c97, 0x3c98, 0x3c98, 0x3c99, 0x3c9a, 0x3c9b, 0x3c9c, 0x3c9c, + 0x3c9d, 0x3c9e, 0x3c9f, 0x3ca0, 0x3ca1, 0x3ca1, 0x3ca2, 0x3ca3, 0x3ca4, 0x3ca5, 0x3ca5, 0x3ca6, + 0x3ca7, 0x3ca8, 0x3ca9, 0x3caa, 0x3caa, 0x3cab, 0x3cac, 0x3cad, 0x3cae, 0x3cae, 0x3caf, 0x3cb0, + 0x3cb1, 0x3cb2, 0x3cb3, 0x3cb3, 0x3cb4, 0x3cb5, 0x3cb6, 0x3cb7, 0x3cb8, 0x3cb8, 0x3cb9, 0x3cba, + 0x3cbb, 0x3cbc, 0x3cbc, 0x3cbd, 0x3cbe, 0x3cbf, 0x3cc0, 0x3cc1, 0x3cc1, 0x3cc2, 0x3cc3, 0x3cc4, + 0x3cc5, 0x3cc5, 0x3cc6, 0x3cc7, 0x3cc8, 0x3cc9, 0x3cca, 0x3cca, 0x3ccb, 0x3ccc, 0x3ccd, 0x3cce, + 0x3cce, 0x3ccf, 0x3cd0, 0x3cd1, 0x3cd2, 0x3cd3, 0x3cd3, 0x3cd4, 0x3cd5, 0x3cd6, 0x3cd7, 0x3cd7, + 0x3cd8, 0x3cd9, 0x3cda, 0x3cdb, 0x3cdc, 0x3cdc, 0x3cdd, 0x3cde, 0x3cdf, 0x3ce0, 0x3ce0, 0x3ce1, + 0x3ce2, 0x3ce3, 0x3ce4, 0x3ce5, 0x3ce5, 0x3ce6, 0x3ce7, 0x3ce8, 0x3ce9, 0x3ce9, 0x3cea, 0x3ceb, + 0x3cec, 0x3ced, 0x3cee, 0x3cee, 0x3cef, 0x3cf0, 0x3cf1, 0x3cf2, 0x3cf2, 0x3cf3, 0x3cf4, 0x3cf5, + 0x3cf6, 0x3cf7, 0x3cf7, 0x3cf8, 0x3cf9, 0x3cfa, 0x3cfb, 0x3cfb, 0x3cfc, 0x3cfd, 0x3cfe, 0x3cff, + 0x3d00, 0x3d00, 0x3d01, 0x3d01, 0x3d01, 0x3d02, 0x3d02, 0x3d03, 0x3d03, 0x3d03, 0x3d04, 0x3d04, + 0x3d05, 0x3d05, 0x3d06, 0x3d06, 0x3d06, 0x3d07, 0x3d07, 0x3d08, 0x3d08, 0x3d08, 0x3d09, 0x3d09, + 0x3d0a, 0x3d0a, 0x3d0a, 0x3d0b, 0x3d0b, 0x3d0c, 0x3d0c, 0x3d0c, 0x3d0d, 0x3d0d, 0x3d0e, 0x3d0e, + 0x3d0f, 0x3d0f, 0x3d0f, 0x3d10, 0x3d10, 0x3d11, 0x3d11, 0x3d11, 0x3d12, 0x3d12, 0x3d13, 0x3d13, + 0x3d13, 0x3d14, 0x3d14, 0x3d15, 0x3d15, 0x3d16, 0x3d16, 0x3d16, 0x3d17, 0x3d17, 0x3d18, 0x3d18, + 0x3d18, 0x3d19, 0x3d19, 0x3d1a, 0x3d1a, 0x3d1a, 0x3d1b, 0x3d1b, 0x3d1c, 0x3d1c, 0x3d1c, 0x3d1d, + 0x3d1d, 0x3d1e, 
0x3d1e, 0x3d1f, 0x3d1f, 0x3d1f, 0x3d20, 0x3d20, 0x3d21, 0x3d21, 0x3d21, 0x3d22, + 0x3d22, 0x3d23, 0x3d23, 0x3d23, 0x3d24, 0x3d24, 0x3d25, 0x3d25, 0x3d25, 0x3d26, 0x3d26, 0x3d27, + 0x3d27, 0x3d28, 0x3d28, 0x3d28, 0x3d29, 0x3d29, 0x3d2a, 0x3d2a, 0x3d2a, 0x3d2b, 0x3d2b, 0x3d2c, + 0x3d2c, 0x3d2c, 0x3d2d, 0x3d2d, 0x3d2e, 0x3d2e, 0x3d2e, 0x3d2f, 0x3d2f, 0x3d30, 0x3d30, 0x3d31, + 0x3d31, 0x3d31, 0x3d32, 0x3d32, 0x3d33, 0x3d33, 0x3d33, 0x3d34, 0x3d34, 0x3d35, 0x3d35, 0x3d35, + 0x3d36, 0x3d36, 0x3d37, 0x3d37, 0x3d38, 0x3d38, 0x3d38, 0x3d39, 0x3d39, 0x3d3a, 0x3d3a, 0x3d3a, + 0x3d3b, 0x3d3b, 0x3d3c, 0x3d3c, 0x3d3c, 0x3d3d, 0x3d3d, 0x3d3e, 0x3d3e, 0x3d3e, 0x3d3f, 0x3d3f, + 0x3d40, 0x3d40, 0x3d41, 0x3d41, 0x3d41, 0x3d42, 0x3d42, 0x3d43, 0x3d43, 0x3d43, 0x3d44, 0x3d44, + 0x3d45, 0x3d45, 0x3d45, 0x3d46, 0x3d46, 0x3d47, 0x3d47, 0x3d47, 0x3d48, 0x3d48, 0x3d49, 0x3d49, + 0x3d4a, 0x3d4a, 0x3d4a, 0x3d4b, 0x3d4b, 0x3d4c, 0x3d4c, 0x3d4c, 0x3d4d, 0x3d4d, 0x3d4e, 0x3d4e, + 0x3d4e, 0x3d4f, 0x3d4f, 0x3d50, 0x3d50, 0x3d50, 0x3d51, 0x3d51, 0x3d52, 0x3d52, 0x3d53, 0x3d53, + 0x3d53, 0x3d54, 0x3d54, 0x3d55, 0x3d55, 0x3d55, 0x3d56, 0x3d56, 0x3d57, 0x3d57, 0x3d57, 0x3d58, + 0x3d58, 0x3d59, 0x3d59, 0x3d59, 0x3d5a, 0x3d5a, 0x3d5b, 0x3d5b, 0x3d5c, 0x3d5c, 0x3d5c, 0x3d5d, + 0x3d5d, 0x3d5e, 0x3d5e, 0x3d5e, 0x3d5f, 0x3d5f, 0x3d60, 0x3d60, 0x3d60, 0x3d60, 0x3d60, 0x3d61, + 0x3d61, 0x3d62, 0x3d62, 0x3d62, 0x3d63, 0x3d63, 0x3d64, 0x3d64, 0x3d64, 0x3d65, 0x3d65, 0x3d66, + 0x3d66, 0x3d66, 0x3d67, 0x3d67, 0x3d68, 0x3d68, 0x3d68, 0x3d69, 0x3d69, 0x3d6a, 0x3d6a, 0x3d6b, + 0x3d6b, 0x3d6b, 0x3d6c, 0x3d6c, 0x3d6d, 0x3d6d, 0x3d6d, 0x3d6e, 0x3d6e, 0x3d6f, 0x3d6f, 0x3d6f, + 0x3d70, 0x3d70, 0x3d71, 0x3d71, 0x3d71, 0x3d72, 0x3d72, 0x3d73, 0x3d73, 0x3d74, 0x3d74, 0x3d74, + 0x3d75, 0x3d75, 0x3d76, 0x3d76, 0x3d76, 0x3d77, 0x3d77, 0x3d78, 0x3d78, 0x3d78, 0x3d79, 0x3d79, + 0x3d7a, 0x3d7a, 0x3d7a, 0x3d7b, 0x3d7b, 0x3d7c, 0x3d7c, 0x3d7d, 0x3d7d, 0x3d7d, 0x3d7e, 0x3d7e, + 0x3d7f, 0x3d7f, 0x3d7f, 0x3d7f, 0x3d81, 0x3d81, 0x3d81, 
0x3d81, 0x3d81, 0x3d82, 0x3d82, 0x3d82, + 0x3d82, 0x3d82, 0x3d83, 0x3d83, 0x3d83, 0x3d83, 0x3d83, 0x3d84, 0x3d84, 0x3d84, 0x3d84, 0x3d85, + 0x3d85, 0x3d85, 0x3d85, 0x3d85, 0x3d86, 0x3d86, 0x3d86, 0x3d86, 0x3d86, 0x3d87, 0x3d87, 0x3d87, + 0x3d87, 0x3d87, 0x3d88, 0x3d88, 0x3d88, 0x3d88, 0x3d88, 0x3d89, 0x3d89, 0x3d89, 0x3d89, 0x3d89, + 0x3d8a, 0x3d8a, 0x3d8a, 0x3d8a, 0x3d8a, 0x3d8b, 0x3d8b, 0x3d8b, 0x3d8b, 0x3d8b, 0x3d8c, 0x3d8c, + 0x3d8c, 0x3d8c, 0x3d8c, 0x3d8d, 0x3d8d, 0x3d8d, 0x3d8d, 0x3d8e, 0x3d8e, 0x3d8e, 0x3d8e, 0x3d8e, + 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d90, 0x3d90, 0x3d90, 0x3d90, 0x3d90, 0x3d91, 0x3d91, + 0x3d91, 0x3d91, 0x3d91, 0x3d92, 0x3d92, 0x3d92, 0x3d92, 0x3d92, 0x3d93, 0x3d93, 0x3d93, 0x3d93, + 0x3d93, 0x3d94, 0x3d94, 0x3d94, 0x3d94, 0x3d94, 0x3d95, 0x3d95, 0x3d95, 0x3d95, 0x3d96, 0x3d96, + 0x3d96, 0x3d96, 0x3d96, 0x3d97, 0x3d97, 0x3d97, 0x3d97, 0x3d97, 0x3d98, 0x3d98, 0x3d98, 0x3d98, + 0x3d98, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d9a, + 0x3d9a, 0x3d9a, 0x3d9a, 0x3d9a, 0x3d9b, 0x3d9b, 0x3d9b, 0x3d9b, 0x3d9b, 0x3d9c, 0x3d9c, 0x3d9c, + 0x3d9c, 0x3d9c, 0x3d9d, 0x3d9d, 0x3d9d, 0x3d9d, 0x3d9e, 0x3d9e, 0x3d9e, 0x3d9e, 0x3d9e, 0x3d9f, + 0x3d9f, 0x3d9f, 0x3d9f, 0x3d9f, 0x3da0, 0x3da0, 0x3da0, 0x3da0, 0x3da0, 0x3da1, 0x3da1, 0x3da1, + 0x3da1, 0x3da1, 0x3da2, 0x3da2, 0x3da2, 0x3da2, 0x3da2, 0x3da3, 0x3da3, 0x3da3, 0x3da3, 0x3da3, + 0x3da4, 0x3da4, 0x3da4, 0x3da4, 0x3da4, 0x3da5, 0x3da5, 0x3da5, 0x3da5, 0x3da6, 0x3da6, 0x3da6, + 0x3da6, 0x3da6, 0x3da7, 0x3da7, 0x3da7, 0x3da7, 0x3da7, 0x3da8, 0x3da8, 0x3da8, 0x3da8, 0x3da8, + 0x3da9, 0x3da9, 0x3da9, 0x3da9, 0x3da9, 0x3daa, 0x3daa, 0x3daa, 0x3daa, 0x3daa, 0x3dab, 0x3dab, + 0x3dab, 0x3dab, 0x3dab, 0x3dac, 0x3dac, 0x3dac, 0x3dac, 0x3dac, 0x3dad, 0x3dad, 0x3dad, 0x3dad, + 0x3dad, 0x3daf, 0x3daf, 0x3daf, 0x3daf, 0x3db0, 0x3db0, 0x3db0, 0x3db0, 0x3db0, 0x3db1, 0x3db1, + 0x3db1, 0x3db1, 0x3db1, 0x3db2, 0x3db2, 0x3db2, 0x3db2, 0x3db2, 0x3db3, 0x3db3, 0x3db3, 0x3db3, 
+ 0x3db3, 0x3db4, 0x3db4, 0x3db4, 0x3db4, 0x3db4, 0x3db5, 0x3db5, 0x3db5, 0x3db5, 0x3db5, 0x3db6, + 0x3db6, 0x3db6, 0x3db6, 0x3db6, 0x3db7, 0x3db7, 0x3db7, 0x3db7, 0x3db8, 0x3db8, 0x3db8, 0x3db8, + 0x3db8, 0x3db9, 0x3db9, 0x3db9, 0x3db9, 0x3db9, 0x3dba, 0x3dba, 0x3dba, 0x3dba, 0x3dba, 0x3dbb, + 0x3dbb, 0x3dbb, 0x3dbb, 0x3dbb, 0x3dbc, 0x3dbc, 0x3dbc, 0x3dbc, 0x3dbc, 0x3dbd, 0x3dbd, 0x3dbd, + 0x3dbd, 0x3dbd, 0x3dbe, 0x3dbe, 0x3dbe, 0x3dbe, 0x3dbe, 0x3dbf, 0x3dbf, 0x3dbf, 0x3dbf, 0x3dbf, + 0x3dc0, 0x3dc0, 0x3dc0, 0x3dc0, 0x3dc1, 0x3dc1, 0x3dc1, 0x3dc1, 0x3dc1, 0x3dc1, 0x3dc1, 0x3dc1, + 0x3dc1, 0x3dc1, 0x3dc2, 0x3dc2, 0x3dc2, 0x3dc2, 0x3dc2, 0x3dc3, 0x3dc3, 0x3dc3, 0x3dc3, 0x3dc3, + 0x3dc4, 0x3dc4, 0x3dc4, 0x3dc4, 0x3dc4, 0x3dc5, 0x3dc5, 0x3dc5, 0x3dc5, 0x3dc5, 0x3dc6, 0x3dc6, + 0x3dc6, 0x3dc6, 0x3dc6, 0x3dc7, 0x3dc7, 0x3dc7, 0x3dc7, 0x3dc7, 0x3dc8, 0x3dc8, 0x3dc8, 0x3dc8, + 0x3dc9, 0x3dc9, 0x3dc9, 0x3dc9, 0x3dc9, 0x3dca, 0x3dca, 0x3dca, 0x3dca, 0x3dca, 0x3dcb, 0x3dcb, + 0x3dcb, 0x3dcb, 0x3dcb, 0x3dcc, 0x3dcd, 0x3dce, 0x3dcf, 0x3dd0, 0x3dd1, 0x3dd2, 0x3dd3, 0x3dd4, + 0x3dd5, 0x3dd6, 0x3dd7, 0x3dd8, 0x3dd9, 0x3dda, 0x3ddb, 0x3ddc, 0x3ddd, 0x3dde, 0x3ddf, 0x3de0, + 0x3de1, 0x3de2, 0x3de3, 0x3de4, +}; + +// dist(range_start, range_end); + int table_hw = 256; + for (uint64_t i = 0; i < ifmap_size; i++) { + // input range is -8 ~ +8 + float input = + ((int)i % (range_end - 2)) * (((int)i % 2) ? 1 : -1) + 0.03 + (i % table_hw) * 0.002; + // float input = ((int)i % 10) * (((int)i % 2) ? 1 : -1) + 0.03 + (i % + // table_hw) * 0.002; float input = dist(e2); input = ((int)i % + // (range_end-2)) * (((int)i % 2) ? 
1 : 1) + 0.03 + (i % table_hw) * + // 0.002; if (input < 1 && input > 0) { + // input = 111.9; + //} + input_data[i] = convert_fp32_bf16(input); + } + input_data[0] = convert_fp32_bf16(0); + input_data[1] = convert_fp32_bf16(1); + input_data[2] = convert_fp32_bf16(-1); + } + +#ifdef DBG + for (uint64_t i = 0; i < ifmap_size; i++) { + printf("source if[%lu] bf16 %f 0x%x, log2f is %f\n", i, convert_bf16_fp32(input_data[i]), + input_data[i], floor(log2((convert_bf16_fp32(input_data[i]))))); + } +#endif /* ifdef DBG */ +} + +static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) { + // TODO: check more shape / align + cvk_chip_info_t chip_info = bmk->info; + + uint32_t input_n = 1; + uint32_t input_c = chip_info.npu_num; + uint32_t input_h = 16; + uint32_t input_w = 16; + float epsilon = 0.01; + int range_start = -8; + int range_end = 8; + cvk_fmt_t fmt = CVK_FMT_BF16; + + if (mode == PRE_DATA_COMPARE_FIX) { + input_h = 4; + input_w = 8; + } + + cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + cvk_tl_shape_t ofmap_shape = ifmap_shape; + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + + int data_type_size = bytesize_of_fmt(fmt); + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + + // get lut table shape and size + cvk_tl_shape_t table_shape; + uint64_t table_bytesize = cvm_lut_tbl_bytesize(bmk, &table_shape, fmt); + + cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *out = tl_ofmap_bf16; + + // atan buf + cvk_tl_t *tl_y0_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_slope_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_invert_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_pos_neg_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + 
+ // reciprocal buf + cvk_tl_t *tl_reciprocal_table_answer = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_reciprocal_table_answer_mantissa = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + + // temp buf + cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf2 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf4 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1); + + uint16_t *input_data = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize); + + // for reciprocal + uint16_t *table_reciprocal_data = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_reciprocal_data_mantissa = (uint16_t *)xmalloc(table_bytesize); + + // for atan + uint16_t *table_data_atan_y0 = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_atan_slope = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_atan_invert = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_atan_pos_neg = (uint16_t *)xmalloc(table_bytesize); + + gen_input(input_data, ifmap_size, mode, range_start, range_end); + tl_lut_ref(ref_data, input_data, ifmap_shape); + + cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape); + cvm_atan_tbl(table_data_atan_y0, table_data_atan_slope, table_data_atan_invert, + table_data_atan_pos_neg, &table_shape); + + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)input_data); + test_put_tensor_g2l_comp(ctx, bmk, tl_reciprocal_table_answer, (uint8_t *)table_reciprocal_data); + test_put_tensor_g2l_comp(ctx, bmk, tl_reciprocal_table_answer_mantissa, + (uint8_t *)table_reciprocal_data_mantissa); + + // prepare atan + test_put_tensor_g2l_comp(ctx, bmk, tl_y0_buf, (uint8_t *)table_data_atan_y0); + test_put_tensor_g2l_comp(ctx, bmk, tl_slope_buf, (uint8_t *)table_data_atan_slope); + test_put_tensor_g2l_comp(ctx, bmk, tl_invert_buf, (uint8_t *)table_data_atan_invert); + 
test_put_tensor_g2l_comp(ctx, bmk, tl_pos_neg_buf, (uint8_t *)table_data_atan_pos_neg); + + cvm_atan_emit(bmk, tl_ifmap, tl_buf, tl_buf2, tl_buf4, tl_y0_buf, tl_slope_buf, tl_invert_buf, + tl_pos_neg_buf, tl_reciprocal_table_answer, tl_reciprocal_table_answer_mantissa, + tl_ofmap_bf16, fmt); + + test_submit_comp(ctx, bmk); + + uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, out); + verify(ofmap_data, ref_data, input_data, ifmap_size, epsilon); + + free_tl(bmk, tl_buf4); + free_tl(bmk, tl_buf2); + free_tl(bmk, tl_buf); + free_tl(bmk, tl_reciprocal_table_answer_mantissa); + free_tl(bmk, tl_reciprocal_table_answer); + free_tl(bmk, tl_pos_neg_buf); + free_tl(bmk, tl_invert_buf); + free_tl(bmk, tl_slope_buf); + free_tl(bmk, tl_y0_buf); + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_ifmap); + + free(table_data_atan_y0); + free(table_data_atan_slope); + free(table_data_atan_invert); + free(table_data_atan_pos_neg); + free(table_reciprocal_data); + free(table_reciprocal_data_mantissa); + free(input_data); + free(ref_data); + free(ofmap_data); +} + +int main() { + cvk_context_t *bmk = NULL; + int round_mode; + round_mode = set_store_feround(); + + CVI_RT_HANDLE ctx; + test_init(&ctx, &bmk); + + // for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++) + // for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_ACCURACY; i++) + for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_U8; i++) + // for (int i = DATA_COMPARE_ACCURACY; i < DATA_COMPARE_U8; i++) + { + mode = static_cast(i); + printf("test mode %d...\n", mode); + testbench(&ctx, bmk); + } + printf("pass\n"); + + test_exit(&ctx, bmk); + restore_feround(round_mode); + return 0; +} diff --git a/cvimath/tests/cvi1835/atan2_degree.cpp b/cvimath/tests/cvi1835/atan2_degree.cpp new file mode 100644 index 000000000..f785c782c --- /dev/null +++ b/cvimath/tests/cvi1835/atan2_degree.cpp @@ -0,0 +1,667 @@ +/** + * \breif atan2 is implemented by atan, you can refer + * 
[wiki](https://en.wikipedia.org/wiki/Atan2) for more details + */ + +#include +#include + +#define OUT +#define IN +#include +#include +#include +#include +#include +#include +//#define DBG + +/** + * pre_data means we test fixed pattern, it should be same sa lut + */ +enum TEST_MODE { + PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare + DATA_COMPARE_ACCURACY, // generate \range_start to \range_end value that + // check epsilon, default set x > 0, y > 0 + + DATA_COMPARE_ACCURACY_X_GT_0, // atan(y/x), x > 0, y = 0 + DATA_COMPARE_ACCURACY_X_LT_0_Y_GE_0, // atan(y/x) + PI , x < 0 and y >= 0 + DATA_COMPARE_ACCURACY_X_LT_0_Y_LT_0, // atan(y/x) - PI , x < 0 and y < 0 + DATA_COMPARE_ACCURACY_X_0_Y_GT_0, // pi / 2, x = 0 and y > 0 + DATA_COMPARE_ACCURACY_X_0_Y_LT_0, // -pi / 2, x = 0 and y < 0 + DATA_COMPARE_U8, // generate \range_start to \range_end value that check + // epsilon, result bf16->uint8_t + TEST_MODE_MAX, +}; + +static TEST_MODE mode; + +static uint16_t test_pattern[] = { + 0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52, 0x3A6C, 0x3A83, 0x3A90, + 0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF, 0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17, + 0x3B1D, 0x3B24, 0x3B2A, 0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65, + 0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90, 0x3B93, 0x3B97, 0x3B9A, + 0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE, 0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1, + 0x3BC5, 0x3BC8, 0x3BCB, 0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9, + 0x3BEC, 0x3BEF, 0x3BF2, 0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03, 0x3C05, 0x3C06, 0x3C08, + 0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12, 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C, + 0x3C1D, 0x3C1F, 0x3C21, 0x3C22, 0x3C24, 0x3C25, 0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F, + 0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E, 0x3C40, 0x3C41, 0x3C43, + 0x3C45, 0x3C46, 
0x3C48, 0x3C4A, 0x3C4B, 0x3C4D, 0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57, + 0x3C58, 0x3C5A, 0x3C5C, 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A, + 0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, 0x3C7B, 0x3C7C, 0x3C7E, + 0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89, + 0x3C8A, 0x3C8A, 0x3C8B, 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93, + 0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, 0x3C9B, 0x3C9C, 0x3C9C, + 0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6, + 0x3CA7, 0x3CA8, 0x3CA9, 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0, + 0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, 0x3CB8, 0x3CB9, 0x3CBA, + 0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4, + 0x3CC5, 0x3CC5, 0x3CC6, 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE, + 0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, 0x3CD6, 0x3CD7, 0x3CD7, + 0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1, + 0x3CE2, 0x3CE3, 0x3CE4, 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB, + 0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, 0x3CF3, 0x3CF4, 0x3CF5, + 0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF, + 0x3D00, 0x3D00, 0x3D01, 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 0x3D03, 0x3D03, 0x3D04, 0x3D04, + 0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, 0x3D08, 0x3D09, 0x3D09, + 0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 0x3D0E, + 0x3D0F, 0x3D0F, 0x3D0F, 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13, + 0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 
0x3D16, 0x3D17, 0x3D17, 0x3D18, 0x3D18, + 0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D, + 0x3D1D, 0x3D1E, 0x3D1E, 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22, + 0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, 0x3D26, 0x3D26, 0x3D27, + 0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C, + 0x3D2C, 0x3D2C, 0x3D2D, 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31, + 0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 0x3D33, 0x3D33, 0x3D34, 0x3D34, 0x3D35, 0x3D35, 0x3D35, + 0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A, + 0x3D3B, 0x3D3B, 0x3D3C, 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F, + 0x3D40, 0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, 0x3D43, 0x3D44, 0x3D44, + 0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49, + 0x3D4A, 0x3D4A, 0x3D4A, 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E, + 0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, 0x3D52, 0x3D53, 0x3D53, + 0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58, + 0x3D58, 0x3D59, 0x3D59, 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D, + 0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, 0x3D61, 0x3D61, 0x3D62, + 0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67, + 0x3D67, 0x3D67, 0x3D68, 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C, + 0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, 0x3D70, 0x3D70, 0x3D70, + 0x3D71, 0x3D71, 0x3D72, 0x3D72, 0x3D72, 0x3D73, 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75, + 0x3D76, 0x3D76, 0x3D77, 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A, 
+ 0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, 0x3D7E, 0x3D7F, 0x3D7F, + 0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82, + 0x3D82, 0x3D82, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85, + 0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D87, 0x3D87, 0x3D87, + 0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89, + 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8C, 0x3D8C, + 0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, + 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91, + 0x3D91, 0x3D91, 0x3D91, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93, + 0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, 0x3D95, 0x3D96, 0x3D96, + 0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98, + 0x3D98, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B, + 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9D, 0x3D9D, 0x3D9D, + 0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0, + 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2, + 0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, + 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7, + 0x3DA7, 0x3DA7, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, + 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAC, 0x3DAC, + 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, + 0x3DAE, 0x3DAF, 0x3DAF, 0x3DAF, 
0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1, + 0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3, + 0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6, + 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8, + 0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBB, + 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD, + 0x3DBD, 0x3DBD, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, + 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC2, 0x3DC2, 0x3DC2, + 0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, + 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7, + 0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, 0x3DC9, 0x3DC9, 0x3DC9, + 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC, + 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5, + 0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, 0x3DDF, 0x3DE0, 0x3DE1, + 0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5, +}; + +static uint16_t golden_bf16[] = { + 0x42b4, 0x42b4, 0x42b4, 0x42b4, 0x42b4, 0x42b4, 0x42b3, 0x42b3, 0x42b3, 0x42b3, 0x42b3, 0x42b3, + 0x42b3, 0x42b3, 0x42b3, 0x42b3, 0x42b2, 0x42b2, 0x42b2, 0x42b2, 0x42b2, 0x42b2, 0x42b2, 0x42b2, + 0x42b2, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42af, + 0x42af, 0x42af, 0x42af, 0x42af, 0x42af, 0x42af, 0x42af, 0x42af, 0x42af, 0x42ae, 0x42ae, 0x42ae, + 0x42ae, 0x42ae, 0x42ae, 0x42ae, 0x42ae, 0x42ae, 0x42ad, 0x42ad, 0x42ad, 0x42ad, 0x42ad, 0x42ad, + 0x42ad, 0x42ad, 0x42ad, 0x42ac, 0x42ac, 0x42ac, 0x42ac, 0x42ac, 0x42ac, 0x42ac, 0x42ac, 
0x42ac, + 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42aa, 0x42aa, 0x42aa, + 0x42aa, 0x42aa, 0x42aa, 0x42aa, 0x42aa, 0x42a9, 0x42a9, 0x42a9, 0x42a9, 0x42a9, 0x42a9, 0x42a9, + 0x42a9, 0x42a9, 0x42a7, 0x42a7, 0x42a7, 0x42a7, 0x42a7, 0x42a7, 0x42a7, 0x42a7, 0x42a6, 0x42a6, + 0x42a6, 0x42a6, 0x42a6, 0x42a6, 0x42a6, 0x42a6, 0x42a5, 0x42a5, 0x42a5, 0x42a5, 0x42a5, 0x42a5, + 0x42a5, 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a3, 0x42a3, + 0x42a3, 0x42a3, 0x42a3, 0x42a3, 0x42a3, 0x42a2, 0x42a2, 0x42a2, 0x42a2, 0x42a2, 0x42a2, 0x42a2, + 0x42a2, 0x42a2, 0x42a1, 0x42a1, 0x42a1, 0x42a1, 0x42a1, 0x42a1, 0x42a1, 0x42a0, 0x42a0, 0x42a0, + 0x42a0, 0x42a0, 0x42a0, 0x42a0, 0x42a0, 0x429e, 0x429e, 0x429e, 0x429e, 0x429e, 0x429e, 0x429d, + 0x429d, 0x429d, 0x429d, 0x429d, 0x429d, 0x429d, 0x429d, 0x429d, 0x429c, 0x429c, 0x429c, 0x429c, + 0x429c, 0x429b, 0x429b, 0x429b, 0x429b, 0x429b, 0x429b, 0x429b, 0x429b, 0x429a, 0x429a, 0x429a, + 0x429a, 0x429a, 0x429a, 0x4299, 0x4299, 0x4299, 0x4299, 0x4299, 0x4299, 0x4299, 0x4299, 0x4298, + 0x4298, 0x4298, 0x4298, 0x4298, 0x4298, 0x4297, 0x4297, 0x4297, 0x4297, 0x4297, 0x4297, 0x4296, + 0x4296, 0x4296, 0x4296, 0x4296, 0x4295, 0x4295, 0x4295, 0x4295, 0x4295, 0x4295, 0x4295, 0x4295, + 0x4294, 0x4294, 0x4294, 0x4294, 0x4294, 0x4294, 0x4293, 0x4293, 0x4293, 0x4293, 0x4293, 0x4293, + 0x4292, 0x4292, 0x4292, 0x4292, 0x4292, 0x4291, 0x4291, 0x4291, 0x4291, 0x4291, 0x4291, 0x4291, + 0x4291, 0x428f, 0x428f, 0x428f, 0x428e, 0x428e, 0x428e, 0x428e, 0x428e, 0x428e, 0x428e, 0x428e, + 0x428d, 0x428d, 0x428d, 0x428d, 0x428c, 0x428c, 0x428c, 0x428c, 0x428c, 0x428c, 0x428c, 0x428c, + 0x428b, 0x428b, 0x428b, 0x428a, 0x428a, 0x428a, 0x428a, 0x428a, 0x428a, 0x428a, 0x4289, 0x4289, + 0x4289, 0x4288, 0x4288, 0x4288, 0x4288, 0x4288, 0x4288, 0x4287, 0x4287, 0x4287, 0x4287, 0x4287, + 0x4286, 0x4286, 0x4286, 0x4286, 0x4286, 0x4286, 0x4286, 0x4286, 0x4285, 0x4285, 0x4285, 0x4285, + 0x4285, 0x4285, 0x4285, 
0x4285, 0x4285, 0x4284, 0x4284, 0x4284, 0x4284, 0x4284, 0x4283, 0x4283, + 0x4282, 0x4282, 0x4282, 0x4282, 0x4282, 0x4281, 0x4281, 0x4281, 0x4281, 0x4281, 0x4281, 0x4281, + 0x4280, 0x4280, 0x4280, 0x427e, 0x427e, 0x427e, 0x427e, 0x427e, 0x427c, 0x427c, 0x427c, 0x427a, + 0x427a, 0x427a, 0x427a, 0x427a, 0x427a, 0x4278, 0x4278, 0x4278, 0x4277, 0x4277, 0x4277, 0x4277, + 0x4277, 0x4277, 0x4275, 0x4275, 0x4275, 0x4273, 0x4273, 0x4273, 0x4273, 0x4273, 0x4271, 0x4271, + 0x4271, 0x4270, 0x4270, 0x4270, 0x4270, 0x4270, 0x4270, 0x4270, 0x426e, 0x426c, 0x426c, 0x426c, + 0x426c, 0x426c, 0x426a, 0x426a, 0x426a, 0x426a, 0x4269, 0x4269, 0x4269, 0x4269, 0x4269, 0x4267, + 0x4267, 0x4266, 0x4266, 0x4266, 0x4266, 0x4266, 0x4264, 0x4264, 0x4264, 0x4262, 0x4262, 0x4262, + 0x4262, 0x4261, 0x4261, 0x4261, 0x425f, 0x425f, 0x425f, 0x425f, 0x425f, 0x425e, 0x425e, 0x425c, + 0x425c, 0x425c, 0x425c, 0x425c, 0x425b, 0x425b, 0x425b, 0x4259, 0x4259, 0x4259, 0x4259, 0x4257, + 0x4257, 0x4257, 0x4256, 0x4256, 0x4256, 0x4256, 0x4256, 0x4253, 0x4253, 0x4253, 0x4253, 0x4253, + 0x4253, 0x4253, 0x4250, 0x4250, 0x4250, 0x4250, 0x4250, 0x424f, 0x424f, 0x424d, 0x424d, 0x424d, + 0x424d, 0x424d, 0x424b, 0x424b, 0x424b, 0x424b, 0x424b, 0x4249, 0x4249, 0x4249, 0x4248, 0x4248, + 0x4248, 0x4248, 0x4247, 0x4247, 0x4247, 0x4245, 0x4245, 0x4244, 0x4244, 0x4244, 0x4243, 0x4243, + 0x4241, 0x4241, 0x4241, 0x4240, 0x4240, 0x4240, 0x4240, 0x4240, 0x423e, 0x423e, 0x423e, 0x423e, + 0x423b, 0x423b, 0x423b, 0x423b, 0x423b, 0x423a, 0x423a, 0x423a, 0x4239, 0x4239, 0x4237, 0x4237, + 0x4237, 0x4236, 0x4236, 0x4236, 0x4236, 0x4236, 0x4235, 0x4235, 0x4234, 0x4234, 0x4232, 0x4232, + 0x4232, 0x4232, 0x4232, 0x4231, 0x4231, 0x4231, 0x422f, 0x422f, 0x422d, 0x422d, 0x422d, 0x422d, + 0x422d, 0x422c, 0x422c, 0x422c, 0x422a, 0x422a, 0x422a, 0x422a, 0x4228, 0x4228, 0x4228, 0x4228, + 0x4228, 0x4227, 0x4227, 0x4227, 0x4225, 0x4225, 0x4223, 0x4223, 0x4223, 0x4223, 0x4223, 0x4223, + 0x4223, 0x4221, 0x4220, 0x4220, 0x4220, 0x4220, 0x421f, 0x421f, 
0x421f, 0x421d, 0x421d, 0x421d, + 0x421d, 0x421d, 0x421b, 0x421b, 0x421b, 0x421b, 0x421b, 0x4219, 0x4219, 0x4218, 0x4218, 0x4218, + 0x4218, 0x4218, 0x4215, 0x4215, 0x4215, 0x4215, 0x4215, 0x4215, 0x4215, 0x4213, 0x4213, 0x4213, + 0x4212, 0x4212, 0x4211, 0x4211, 0x4211, 0x420f, 0x420f, 0x420f, 0x420f, 0x420d, 0x420d, 0x420d, + 0x420c, 0x420c, 0x420c, 0x420c, 0x420c, 0x420a, 0x420a, 0x4209, 0x4209, 0x4209, 0x4209, 0x4209, + 0x4207, 0x4207, 0x4207, 0x4206, 0x4206, 0x4206, 0x4206, 0x4204, 0x4204, 0x4204, 0x4202, 0x4202, + 0x4202, 0x4202, 0x4202, 0x4201, 0x4201, 0x41fe, 0x41fe, 0x41fe, 0x41fe, 0x41fe, 0x41fb, 0x41fb, + 0x41fb, 0x41fb, 0x41f8, 0x41f8, 0x41f8, 0x41f8, 0x41f8, 0x41f4, 0x41f1, 0x41f1, 0x41f1, 0x41f1, + 0x41f1, 0x41f1, 0x41f1, 0x41ed, 0x41ed, 0x41ed, 0x41ed, 0x41ed, 0x41ea, 0x41ea, 0x41ea, 0x41e6, + 0x41e6, 0x41e6, 0x41e3, 0x41e3, 0x41e3, 0x41e3, 0x41e3, 0x41df, 0x41df, 0x41df, 0x41df, 0x41dc, + 0x41dc, 0x41dc, 0x41dc, 0x41dc, 0x41d8, 0x41d8, 0x41d8, 0x41d8, 0x41d5, 0x41d5, 0x41d5, 0x41d5, + 0x41d5, 0x41d1, 0x41d1, 0x41d1, 0x41cd, 0x41cd, 0x41cd, 0x41cd, 0x41cd, 0x41cd, 0x41cd, 0x41c9, + 0x41c9, 0x41c9, 0x41c6, 0x41c6, 0x41c6, 0x41c6, 0x41c6, 0x41c6, 0x41c6, 0x41c2, 0x41c2, 0x41be, + 0x41be, 0x41be, 0x41be, 0x41be, 0x41be, 0x41ba, 0x41ba, 0x41ba, 0x41ba, 0x41ba, 0x41b6, 0x41b6, + 0x41b6, 0x41b6, 0x41b6, 0x41b6, 0x41b6, 0x41b2, 0x41b2, 0x41ae, 0x41ae, 0x41ae, 0x41ae, 0x41ae, + 0x41ae, 0x41ae, 0x41ae, 0x41aa, 0x41aa, 0x41aa, 0x41aa, 0x41aa, 0x41a6, 0x41a6, 0x41a6, 0x41a6, + 0x41a6, 0x41a2, 0x41a2, 0x41a2, 0x41a2, 0x419e, 0x419e, 0x419e, 0x419e, 0x419e, 0x419e, 0x419a, + 0x419a, 0x419a, 0x419a, 0x419a, 0x4196, 0x4196, 0x4196, 0x4196, 0x4196, 0x4196, 0x4196, 0x4196, + 0x4196, 0x4192, 0x418e, 0x418e, 0x418e, 0x418e, 0x418e, 0x418e, 0x418e, 0x418e, 0x418e, 0x418a, + 0x418a, 0x418a, 0x418a, 0x418a, 0x4186, 0x4186, 0x4186, 0x4186, 0x4186, 0x4186, 0x4186, 0x4181, + 0x4181, 0x4181, 0x4181, 0x4181, 0x417a, 0x417a, 0x417a, 0x417a, 0x417a, 0x417a, 0x417a, 0x417a, + 
0x4172, 0x4172, 0x4172, 0x4172, 0x4172, 0x4169, 0x4169, 0x4169, 0x4169, 0x4169, 0x4169, 0x4161, + 0x4161, 0x4161, 0x4161, 0x4161, 0x4161, 0x4158, 0x4158, 0x4158, 0x4158, 0x4158, 0x4158, 0x4158, + 0x4158, 0x4158, 0x414f, 0x414f, 0x414f, 0x414f, 0x414f, 0x4147, 0x4147, 0x4147, 0x4147, 0x4147, + 0x4147, 0x4147, 0x4147, 0x413e, 0x413e, 0x413e, 0x413e, 0x413e, 0x4135, 0x4135, 0x4135, 0x4135, + 0x4135, 0x4135, 0x4135, 0x4135, 0x4135, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x4123, + 0x4123, 0x4123, 0x4123, 0x4123, 0x4123, 0x4123, 0x4123, 0x4123, 0x411a, 0x411a, 0x411a, 0x411a, + 0x411a, 0x411a, 0x4111, 0x4111, 0x4111, 0x4111, 0x4111, 0x4111, 0x4111, 0x4111, 0x4108, 0x4108, + 0x4108, 0x4108, 0x4108, 0x4108, 0x4108, 0x4108, 0x40ff, 0x40ff, 0x40ff, 0x40ff, 0x40ff, 0x40ff, + 0x40ff, 0x40ff, 0x40ed, 0x40ed, 0x40ed, 0x40ed, 0x40ed, 0x40ed, 0x40ed, 0x40ed, 0x40db, 0x40db, + 0x40db, 0x40db, 0x40db, 0x40db, 0x40db, 0x40db, 0x40c9, 0x40c9, 0x40c9, 0x40c9, 0x40c9, 0x40c9, + 0x40c9, 0x40c9, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, + 0x40a5, 0x40a5, 0x40a5, 0x40a5, 0x40a5, 0x40a5, 0x40a5, 0x4092, 0x4092, 0x4092, 0x4092, 0x4092, + 0x4092, 0x4092, 0x4092, 0x4092, 0x4080, 0x4080, 0x4080, 0x4080, 0x4080, 0x4080, 0x4080, 0x4080, + 0x4080, 0x405c, 0x405c, 0x405c, 0x405c, 0x405c, 0x405c, 0x405c, 0x405c, 0x405c, 0x4037, 0x4037, + 0x4037, 0x4037, 0x4037, 0x4037, 0x4037, 0x4037, 0x4037, 0x4013, 0x4013, 0x4013, 0x4013, 0x4013, + 0x4013, 0x4013, 0x4013, 0x4013, 0x4013, 0x3fdc, 0x3fdc, 0x3fdc, 0x3fdc, 0x3fdc, 0x3fdc, 0x3fdc, + 0x3fdc, 0x3fdc, 0x3fdc, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, + 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, +}; + +// dist(range_start, range_end); + + float LO = pow(2, range_start); + float HI = pow(2, range_end); + for (uint64_t i = 0; i < ifmap_size; i++) { + // input range is -8 ~ +8 + int table_hw = 256; + float input = + 
((int)i % (range_end - 2)) * (((int)i % 2) ? 1 : -1) + 0.03 + (i % table_hw) * 0.002; + input = ((int)i % (range_end - 2)) * (((int)i % 2) ? 1 : 1) + 0.03 + (i % table_hw) * 0.002; + input_data[i] = convert_fp32_bf16(input); + input = dist(e2); + input = LO + static_cast(rand()) / (static_cast(RAND_MAX / (HI - LO))); + } +} + +static void gen_input(uint16_t *x, uint16_t *y, uint64_t ifmap_size, TEST_MODE mode, + int range_start, int range_end) { + if (mode == PRE_DATA_COMPARE_FIX) { + memcpy(x, &test_pattern, sizeof(test_pattern)); + } else { + range_start = abs(range_start); + range_end = abs(range_end); + _gen_input(x, ifmap_size, range_start, range_end); + } + + // invert for test + for (uint64_t i = 0; i < ifmap_size; i++) { + y[i] = x[(ifmap_size - 1) - i]; + } + + if (mode == DATA_COMPARE_ACCURACY_X_GT_0) { + // y = any + uint32_t i = 0; + for (; i < ifmap_size / 4; i++) { + // y < 0 + y[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(y[i])); + y[i + ifmap_size / 4] = convert_fp32_bf16(0); + } + } else if (mode == DATA_COMPARE_ACCURACY_X_LT_0_Y_GE_0) { + // x < 0 and y >= 0 + for (uint32_t i = 0; i < ifmap_size; i++) { + x[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(x[i])); + } + + for (uint32_t i = 0; i < ifmap_size / 4; i++) { + y[i + ifmap_size / 4] = convert_fp32_bf16(0); + } + } else if (mode == DATA_COMPARE_ACCURACY_X_LT_0_Y_LT_0) { + // x < 0 and y < 0 + for (uint32_t i = 0; i < ifmap_size; i++) { + x[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(x[i])); + y[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(y[i])); + } + } else if (mode == DATA_COMPARE_ACCURACY_X_0_Y_GT_0) { + // pi / 2, x = 0 and y > 0 + for (uint32_t i = 0; i < ifmap_size; i++) { + x[i] = convert_fp32_bf16(0); + } + } else if (mode == DATA_COMPARE_ACCURACY_X_0_Y_LT_0) { + // -pi / 2, x = 0 and y < 0 + for (uint32_t i = 0; i < ifmap_size; i++) { + x[i] = convert_fp32_bf16(0); + y[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(y[i])); + } + } + + if (mode != PRE_DATA_COMPARE_FIX) { + int i 
= 0; + x[i] = convert_fp32_bf16(-10.0); + y[i++] = convert_fp32_bf16(6.0); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(19.000000); + y[i] = convert_fp32_bf16(5.000000); + x[i++] = convert_fp32_bf16(-125.000000); + y[i] = convert_fp32_bf16(1.070312); + x[i++] = convert_fp32_bf16(0.498046); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + x[i] = convert_fp32_bf16(424.000); + y[i++] = convert_fp32_bf16(-1.00); + x[i] = convert_fp32_bf16(2.484375); + y[i++] = convert_fp32_bf16(-7.531250); + x[i] = convert_fp32_bf16(-2.484375); + y[i++] = convert_fp32_bf16(-7.531250); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(7.531250); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(-7.531250); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(0); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(0.394531); + y[i] = convert_fp32_bf16(-4.000000); + x[i++] = convert_fp32_bf16(-64.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-40.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-53.000000); + y[i] = convert_fp32_bf16(-9.000000); + x[i++] = convert_fp32_bf16(-91.000000); + y[i] = convert_fp32_bf16(12.000000); + x[i++] = convert_fp32_bf16(-164.000000); + y[i] = convert_fp32_bf16(-20.000000); + x[i++] = convert_fp32_bf16(-320.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-71.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-155.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-247.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-118.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-54.000000); + y[i] = convert_fp32_bf16(-5.000000); + x[i++] = convert_fp32_bf16(-392.000000); + y[i] = convert_fp32_bf16(-37.000000); + 
x[i++] = convert_fp32_bf16(-520.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-19.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-10.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-21.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-17.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-17.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-10.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-69.000000); + y[i] = convert_fp32_bf16(4.000000); + x[i++] = convert_fp32_bf16(-86.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-34.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(1.000000); + 
x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-136.000000); + y[i] = convert_fp32_bf16(-3.000000); + x[i++] = convert_fp32_bf16(-79.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-38.000000); + y[i] = convert_fp32_bf16(5.000000); + x[i++] = convert_fp32_bf16(-173.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-78.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-60.000000); + y[i] = convert_fp32_bf16(3.000000); + x[i++] = convert_fp32_bf16(-123.000000); + y[i] = convert_fp32_bf16(-9.000000); + x[i++] = convert_fp32_bf16(-280.000000); + y[i] = convert_fp32_bf16(3.000000); + x[i++] = convert_fp32_bf16(-39.000000); + y[i] = convert_fp32_bf16(2.000000); + x[i++] = convert_fp32_bf16(-524.000000); + y[i] = convert_fp32_bf16(11.000000); + x[i++] = convert_fp32_bf16(-376.000000); + y[i] = convert_fp32_bf16(5.000000); + x[i++] = convert_fp32_bf16(-131.000000); + y[i] = convert_fp32_bf16(11.000000); + x[i++] = convert_fp32_bf16(-324.000000); + y[i] = convert_fp32_bf16(9.000000); + x[i++] = convert_fp32_bf16(-125.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-92.000000); + y[i] = convert_fp32_bf16(-7.000000); + x[i++] = convert_fp32_bf16(-233.000000); + y[i] = convert_fp32_bf16(10.000000); + x[i++] = convert_fp32_bf16(-170.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-10.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-23.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(-3.000000); + x[i++] = convert_fp32_bf16(-37.000000); + + y[i] = 
convert_fp32_bf16(-9); + x[i++] = convert_fp32_bf16(-1); + + y[i] = convert_fp32_bf16(7.0); + x[i++] = convert_fp32_bf16(-1); + + y[i] = convert_fp32_bf16(0); + x[i++] = convert_fp32_bf16(-1); + } + +#ifdef DBG + for (uint64_t i = 0; i < ifmap_size; i++) { + printf("source[%ld] y %f x %f\n", i, convert_bf16_fp32(y[i]), convert_bf16_fp32(x[i])); + } +#endif /* ifdef DBG */ +} + +static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) { + // TODO: check more shape / align + cvk_chip_info_t chip_info = bmk->info; + + uint32_t input_n = 1; + uint32_t input_c = chip_info.npu_num; + uint32_t input_h = 16; + uint32_t input_w = 16; + float epsilon = 0.2; + int range_start = -8; + int range_end = 8; + + if (mode == PRE_DATA_COMPARE_FIX) { + input_h = 4; + input_w = 8; + } + + cvk_fmt_t fmt = CVK_FMT_BF16; + + cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + cvk_tl_shape_t ofmap_shape = ifmap_shape; + + // get lut table shape and size + cvk_tl_shape_t table_shape; + uint64_t table_bytesize = cvm_lut_tbl_bytesize(bmk, &table_shape, fmt); + + // get input / output size + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + int data_type_size = bytesize_of_fmt(fmt); + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + + // atan2 was two inputs + cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ifmap2 = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *out = tl_ofmap_bf16; + + // atan buf + cvk_tl_t *tl_y0_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_invert_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_pos_neg_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + + // reciprocal buf + cvk_tl_t *tl_reciprocal_table_answer = test_alloc_tl(bmk, table_shape, 
fmt, /*align*/ 1); + cvk_tl_t *tl_reciprocal_table_answer_mantissa = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + + // temp buf + cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf2 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf3 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1); + + uint16_t *input_data = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *input_data2 = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize); + + // for reciprocal + uint16_t *table_reciprocal_data = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_reciprocal_data_mantissa = (uint16_t *)xmalloc(table_bytesize); + + // for atan + uint16_t *table_data_atan_y0 = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_atan_invert = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_atan_pos_neg = (uint16_t *)xmalloc(table_bytesize); + + // for search '0' index + uint16_t *idx_0_table_data = (uint16_t *)xmalloc(table_bytesize); + + // init input / ref + // input_data is x, input_data2 is y + gen_input(input_data, input_data2, ifmap_size, mode, range_start, range_end); + tl_lut_ref(ref_data, input_data, input_data2, ifmap_shape); + + // init lut table + cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape); + cvm_atan_fast_degree_tbl(table_data_atan_y0, table_data_atan_invert, table_data_atan_pos_neg, + &table_shape); + cvm_gen_0_tbl(idx_0_table_data, &table_shape); + + // sys->local + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)input_data); + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap2, (uint8_t *)input_data2); + test_put_tensor_g2l_comp(ctx, bmk, tl_reciprocal_table_answer, (uint8_t *)table_reciprocal_data); + test_put_tensor_g2l_comp(ctx, bmk, tl_reciprocal_table_answer_mantissa, + (uint8_t *)table_reciprocal_data_mantissa); + + test_put_tensor_g2l_comp(ctx, bmk, tl_y0_buf, (uint8_t 
*)table_data_atan_y0); + test_put_tensor_g2l_comp(ctx, bmk, tl_invert_buf, (uint8_t *)table_data_atan_invert); + test_put_tensor_g2l_comp(ctx, bmk, tl_pos_neg_buf, (uint8_t *)table_data_atan_pos_neg); + + cvm_atan2_fast_degree_emit(bmk, tl_ifmap2, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, + tl_invert_buf, tl_pos_neg_buf, tl_reciprocal_table_answer, + tl_reciprocal_table_answer_mantissa, OUT tl_ofmap_bf16, fmt); + + uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, out); + verify(ofmap_data, ref_data, input_data, input_data2, ifmap_size, epsilon); + + free_tl(bmk, tl_buf3); + free_tl(bmk, tl_buf2); + free_tl(bmk, tl_buf); + free_tl(bmk, tl_reciprocal_table_answer_mantissa); + free_tl(bmk, tl_reciprocal_table_answer); + free_tl(bmk, tl_pos_neg_buf); + free_tl(bmk, tl_invert_buf); + free_tl(bmk, tl_y0_buf); + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_ifmap2); + free_tl(bmk, tl_ifmap); + + free(table_data_atan_y0); + free(idx_0_table_data); + free(table_data_atan_invert); + free(table_data_atan_pos_neg); + free(table_reciprocal_data); + free(table_reciprocal_data_mantissa); + free(input_data); + free(ref_data); + free(ofmap_data); + free(input_data2); +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bmk; + int round_mode; + + round_mode = set_store_feround(); + + test_init(&ctx, &bmk); + + // for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++) + // for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_ACCURACY; i++) { + // for (int i = DATA_COMPARE_ACCURACY; i < DATA_COMPARE_U8; i++) { + // for (int i = DATA_COMPARE_ACCURACY; i < DATA_COMPARE_ACCURACY_X_GT_0; i++) + // { + for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_U8; i++) { + mode = static_cast(i); + printf("test mode %d...\n", mode); + testbench(&ctx, bmk); + } + printf("pass\n"); + + test_exit(&ctx, bmk); + restore_feround(round_mode); + return 0; +} diff --git a/cvimath/tests/cvi1835/atan2_radian.cpp b/cvimath/tests/cvi1835/atan2_radian.cpp new file mode 100644 
index 000000000..3d2189983 --- /dev/null +++ b/cvimath/tests/cvi1835/atan2_radian.cpp @@ -0,0 +1,719 @@ +/** + * \breif atan2 is implemented by atan, you can refer + * [wiki](https://en.wikipedia.org/wiki/Atan2) for more details + */ + +#include +#include + +#define OUT +#define IN +#include +#include +#include +#include +#include +#include +//#define DBG + +/** + * pre_data means we test fixed pattern, it should be same sa lut + */ +enum TEST_MODE { + PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare + DATA_COMPARE_ACCURACY, // generate \range_start to \range_end value that + // check epsilon, default set x > 0, y > 0 + + DATA_COMPARE_ACCURACY_X_GT_0, // atan(y/x), x > 0, y = 0 + DATA_COMPARE_ACCURACY_X_LT_0_Y_GE_0, // atan(y/x) + PI , x < 0 and y >= 0 + DATA_COMPARE_ACCURACY_X_LT_0_Y_LT_0, // atan(y/x) - PI , x < 0 and y < 0 + DATA_COMPARE_ACCURACY_X_0_Y_GT_0, // pi / 2, x = 0 and y > 0 + DATA_COMPARE_ACCURACY_X_0_Y_LT_0, // -pi / 2, x = 0 and y < 0 + DATA_COMPARE_U8, // generate \range_start to \range_end value that check + // epsilon, result bf16->uint8_t + TEST_MODE_MAX, +}; + +static TEST_MODE mode; + +static uint16_t test_pattern[] = { + 0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52, 0x3A6C, 0x3A83, 0x3A90, + 0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF, 0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17, + 0x3B1D, 0x3B24, 0x3B2A, 0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65, + 0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90, 0x3B93, 0x3B97, 0x3B9A, + 0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE, 0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1, + 0x3BC5, 0x3BC8, 0x3BCB, 0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9, + 0x3BEC, 0x3BEF, 0x3BF2, 0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03, 0x3C05, 0x3C06, 0x3C08, + 0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12, 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C, + 0x3C1D, 0x3C1F, 0x3C21, 0x3C22, 0x3C24, 0x3C25, 
0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F, + 0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E, 0x3C40, 0x3C41, 0x3C43, + 0x3C45, 0x3C46, 0x3C48, 0x3C4A, 0x3C4B, 0x3C4D, 0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57, + 0x3C58, 0x3C5A, 0x3C5C, 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A, + 0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, 0x3C7B, 0x3C7C, 0x3C7E, + 0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89, + 0x3C8A, 0x3C8A, 0x3C8B, 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93, + 0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, 0x3C9B, 0x3C9C, 0x3C9C, + 0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6, + 0x3CA7, 0x3CA8, 0x3CA9, 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0, + 0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, 0x3CB8, 0x3CB9, 0x3CBA, + 0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4, + 0x3CC5, 0x3CC5, 0x3CC6, 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE, + 0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, 0x3CD6, 0x3CD7, 0x3CD7, + 0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1, + 0x3CE2, 0x3CE3, 0x3CE4, 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB, + 0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, 0x3CF3, 0x3CF4, 0x3CF5, + 0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF, + 0x3D00, 0x3D00, 0x3D01, 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 0x3D03, 0x3D03, 0x3D04, 0x3D04, + 0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, 0x3D08, 0x3D09, 0x3D09, + 0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 
0x3D0E, + 0x3D0F, 0x3D0F, 0x3D0F, 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13, + 0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 0x3D16, 0x3D17, 0x3D17, 0x3D18, 0x3D18, + 0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D, + 0x3D1D, 0x3D1E, 0x3D1E, 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22, + 0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, 0x3D26, 0x3D26, 0x3D27, + 0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C, + 0x3D2C, 0x3D2C, 0x3D2D, 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31, + 0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 0x3D33, 0x3D33, 0x3D34, 0x3D34, 0x3D35, 0x3D35, 0x3D35, + 0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A, + 0x3D3B, 0x3D3B, 0x3D3C, 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F, + 0x3D40, 0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, 0x3D43, 0x3D44, 0x3D44, + 0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49, + 0x3D4A, 0x3D4A, 0x3D4A, 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E, + 0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, 0x3D52, 0x3D53, 0x3D53, + 0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58, + 0x3D58, 0x3D59, 0x3D59, 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D, + 0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, 0x3D61, 0x3D61, 0x3D62, + 0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67, + 0x3D67, 0x3D67, 0x3D68, 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C, + 0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, 0x3D70, 0x3D70, 0x3D70, + 0x3D71, 0x3D71, 0x3D72, 
0x3D72, 0x3D72, 0x3D73, 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75, + 0x3D76, 0x3D76, 0x3D77, 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A, + 0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, 0x3D7E, 0x3D7F, 0x3D7F, + 0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82, + 0x3D82, 0x3D82, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85, + 0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D87, 0x3D87, 0x3D87, + 0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89, + 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8C, 0x3D8C, + 0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, + 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91, + 0x3D91, 0x3D91, 0x3D91, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93, + 0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, 0x3D95, 0x3D96, 0x3D96, + 0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98, + 0x3D98, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B, + 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9D, 0x3D9D, 0x3D9D, + 0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0, + 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2, + 0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, + 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7, + 0x3DA7, 0x3DA7, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, + 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 
0x3DAB, 0x3DAB, 0x3DAC, 0x3DAC, + 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, + 0x3DAE, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1, + 0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3, + 0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6, + 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8, + 0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBB, + 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD, + 0x3DBD, 0x3DBD, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, + 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC2, 0x3DC2, 0x3DC2, + 0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, + 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7, + 0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, 0x3DC9, 0x3DC9, 0x3DC9, + 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC, + 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5, + 0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, 0x3DDF, 0x3DE0, 0x3DE1, + 0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5, +}; + +static uint16_t golden_bf16[] = { + 0x3fc9, 0x3fc9, 0x3fc9, 0x3fc9, 0x3fc9, 0x3fc9, 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc8, + 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, + 0x3fc7, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc4, + 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc3, 0x3fc3, 0x3fc3, + 0x3fc3, 0x3fc3, 0x3fc3, 
0x3fc3, 0x3fc3, 0x3fc3, 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc1, + 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, + 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbe, 0x3fbe, 0x3fbe, + 0x3fbe, 0x3fbe, 0x3fbe, 0x3fbe, 0x3fbe, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, + 0x3fbc, 0x3fbc, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fba, 0x3fba, + 0x3fba, 0x3fba, 0x3fba, 0x3fba, 0x3fba, 0x3fba, 0x3fb9, 0x3fb9, 0x3fb9, 0x3fb9, 0x3fb9, 0x3fb9, + 0x3fb9, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb6, 0x3fb6, + 0x3fb6, 0x3fb6, 0x3fb6, 0x3fb6, 0x3fb6, 0x3fb5, 0x3fb5, 0x3fb5, 0x3fb5, 0x3fb5, 0x3fb5, 0x3fb5, + 0x3fb5, 0x3fb5, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb2, 0x3fb2, 0x3fb2, + 0x3fb2, 0x3fb2, 0x3fb2, 0x3fb2, 0x3fb2, 0x3fb1, 0x3fb1, 0x3fb1, 0x3fb1, 0x3fb1, 0x3fb1, 0x3fb0, + 0x3fb0, 0x3fb0, 0x3fb0, 0x3fb0, 0x3fb0, 0x3fb0, 0x3fb0, 0x3fb0, 0x3faf, 0x3faf, 0x3faf, 0x3faf, + 0x3faf, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fac, 0x3fac, 0x3fac, + 0x3fac, 0x3fac, 0x3fac, 0x3fab, 0x3fab, 0x3fab, 0x3fab, 0x3fab, 0x3fab, 0x3fab, 0x3fab, 0x3faa, + 0x3faa, 0x3faa, 0x3faa, 0x3faa, 0x3faa, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa7, + 0x3fa7, 0x3fa7, 0x3fa7, 0x3fa7, 0x3fa6, 0x3fa6, 0x3fa6, 0x3fa6, 0x3fa6, 0x3fa6, 0x3fa6, 0x3fa6, + 0x3fa5, 0x3fa5, 0x3fa5, 0x3fa5, 0x3fa5, 0x3fa5, 0x3fa4, 0x3fa4, 0x3fa4, 0x3fa4, 0x3fa4, 0x3fa4, + 0x3fa3, 0x3fa3, 0x3fa3, 0x3fa3, 0x3fa3, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa1, + 0x3fa1, 0x3fa0, 0x3fa0, 0x3fa0, 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9f, + 0x3f9e, 0x3f9e, 0x3f9e, 0x3f9e, 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9d, + 0x3f9c, 0x3f9c, 0x3f9c, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f99, 0x3f99, + 0x3f99, 0x3f98, 0x3f98, 0x3f98, 0x3f98, 0x3f98, 0x3f98, 0x3f97, 
0x3f97, 0x3f97, 0x3f97, 0x3f97, + 0x3f96, 0x3f96, 0x3f96, 0x3f96, 0x3f96, 0x3f96, 0x3f96, 0x3f96, 0x3f95, 0x3f95, 0x3f94, 0x3f94, + 0x3f94, 0x3f94, 0x3f94, 0x3f94, 0x3f94, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f92, 0x3f92, + 0x3f91, 0x3f91, 0x3f91, 0x3f91, 0x3f91, 0x3f90, 0x3f90, 0x3f90, 0x3f90, 0x3f90, 0x3f90, 0x3f90, + 0x3f8f, 0x3f8f, 0x3f8f, 0x3f8e, 0x3f8e, 0x3f8e, 0x3f8e, 0x3f8e, 0x3f8d, 0x3f8d, 0x3f8d, 0x3f8c, + 0x3f8c, 0x3f8c, 0x3f8c, 0x3f8c, 0x3f8c, 0x3f8b, 0x3f8b, 0x3f8b, 0x3f8a, 0x3f8a, 0x3f8a, 0x3f8a, + 0x3f8a, 0x3f8a, 0x3f89, 0x3f89, 0x3f89, 0x3f88, 0x3f88, 0x3f88, 0x3f88, 0x3f88, 0x3f87, 0x3f87, + 0x3f87, 0x3f86, 0x3f86, 0x3f86, 0x3f86, 0x3f86, 0x3f86, 0x3f86, 0x3f85, 0x3f84, 0x3f84, 0x3f84, + 0x3f84, 0x3f84, 0x3f83, 0x3f83, 0x3f83, 0x3f83, 0x3f82, 0x3f82, 0x3f82, 0x3f82, 0x3f82, 0x3f81, + 0x3f81, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7d, 0x3f7d, 0x3f7d, + 0x3f7d, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f78, 0x3f78, 0x3f76, + 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f74, 0x3f74, 0x3f74, 0x3f72, 0x3f72, 0x3f72, 0x3f72, 0x3f71, + 0x3f71, 0x3f71, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6c, 0x3f6c, 0x3f6c, 0x3f6c, 0x3f6c, + 0x3f6c, 0x3f6c, 0x3f69, 0x3f69, 0x3f69, 0x3f69, 0x3f69, 0x3f67, 0x3f67, 0x3f65, 0x3f65, 0x3f65, + 0x3f65, 0x3f65, 0x3f62, 0x3f62, 0x3f62, 0x3f62, 0x3f62, 0x3f61, 0x3f61, 0x3f61, 0x3f5f, 0x3f5f, + 0x3f5f, 0x3f5f, 0x3f5e, 0x3f5e, 0x3f5e, 0x3f5c, 0x3f5c, 0x3f5b, 0x3f5b, 0x3f5b, 0x3f59, 0x3f59, + 0x3f58, 0x3f58, 0x3f58, 0x3f57, 0x3f57, 0x3f57, 0x3f57, 0x3f57, 0x3f54, 0x3f54, 0x3f54, 0x3f54, + 0x3f51, 0x3f51, 0x3f51, 0x3f51, 0x3f51, 0x3f50, 0x3f50, 0x3f50, 0x3f4e, 0x3f4e, 0x3f4d, 0x3f4d, + 0x3f4d, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4a, 0x3f4a, 0x3f49, 0x3f49, 0x3f46, 0x3f46, + 0x3f46, 0x3f46, 0x3f46, 0x3f45, 0x3f45, 0x3f45, 0x3f44, 0x3f44, 0x3f41, 0x3f41, 0x3f41, 0x3f41, + 0x3f41, 0x3f40, 0x3f40, 0x3f40, 0x3f3e, 0x3f3e, 0x3f3e, 0x3f3e, 0x3f3c, 0x3f3c, 0x3f3c, 0x3f3c, + 
0x3f3c, 0x3f3a, 0x3f3a, 0x3f3a, 0x3f39, 0x3f39, 0x3f36, 0x3f36, 0x3f36, 0x3f36, 0x3f36, 0x3f36, + 0x3f36, 0x3f34, 0x3f33, 0x3f33, 0x3f33, 0x3f33, 0x3f31, 0x3f31, 0x3f31, 0x3f30, 0x3f30, 0x3f30, + 0x3f30, 0x3f30, 0x3f2d, 0x3f2d, 0x3f2d, 0x3f2d, 0x3f2d, 0x3f2b, 0x3f2b, 0x3f2a, 0x3f2a, 0x3f2a, + 0x3f2a, 0x3f2a, 0x3f26, 0x3f26, 0x3f26, 0x3f26, 0x3f26, 0x3f26, 0x3f26, 0x3f25, 0x3f25, 0x3f25, + 0x3f23, 0x3f23, 0x3f21, 0x3f21, 0x3f21, 0x3f20, 0x3f20, 0x3f20, 0x3f20, 0x3f1e, 0x3f1e, 0x3f1e, + 0x3f1c, 0x3f1c, 0x3f1c, 0x3f1c, 0x3f1c, 0x3f1b, 0x3f1b, 0x3f19, 0x3f19, 0x3f19, 0x3f19, 0x3f19, + 0x3f17, 0x3f17, 0x3f17, 0x3f15, 0x3f15, 0x3f15, 0x3f15, 0x3f14, 0x3f14, 0x3f14, 0x3f12, 0x3f12, + 0x3f12, 0x3f12, 0x3f12, 0x3f10, 0x3f10, 0x3f0e, 0x3f0e, 0x3f0e, 0x3f0e, 0x3f0e, 0x3f0c, 0x3f0c, + 0x3f0c, 0x3f0c, 0x3f0a, 0x3f0a, 0x3f0a, 0x3f0a, 0x3f0a, 0x3f08, 0x3f07, 0x3f07, 0x3f07, 0x3f07, + 0x3f07, 0x3f07, 0x3f07, 0x3f05, 0x3f05, 0x3f05, 0x3f05, 0x3f05, 0x3f03, 0x3f03, 0x3f03, 0x3f01, + 0x3f01, 0x3f01, 0x3efe, 0x3efe, 0x3efe, 0x3efe, 0x3efe, 0x3efa, 0x3efa, 0x3efa, 0x3efa, 0x3ef6, + 0x3ef6, 0x3ef6, 0x3ef6, 0x3ef6, 0x3ef1, 0x3ef1, 0x3ef1, 0x3ef1, 0x3eed, 0x3eed, 0x3eed, 0x3eed, + 0x3eed, 0x3ee9, 0x3ee9, 0x3ee9, 0x3ee5, 0x3ee5, 0x3ee5, 0x3ee5, 0x3ee5, 0x3ee5, 0x3ee5, 0x3ee1, + 0x3ee1, 0x3ee1, 0x3edd, 0x3edd, 0x3edd, 0x3edd, 0x3edd, 0x3edd, 0x3edd, 0x3ed9, 0x3ed9, 0x3ed4, + 0x3ed4, 0x3ed4, 0x3ed4, 0x3ed4, 0x3ed4, 0x3ed0, 0x3ed0, 0x3ed0, 0x3ed0, 0x3ed0, 0x3ecc, 0x3ecc, + 0x3ecc, 0x3ecc, 0x3ecc, 0x3ecc, 0x3ecc, 0x3ec7, 0x3ec7, 0x3ec3, 0x3ec3, 0x3ec3, 0x3ec3, 0x3ec3, + 0x3ec3, 0x3ec3, 0x3ec3, 0x3ebe, 0x3ebe, 0x3ebe, 0x3ebe, 0x3ebe, 0x3eba, 0x3eba, 0x3eba, 0x3eba, + 0x3eba, 0x3eb5, 0x3eb5, 0x3eb5, 0x3eb5, 0x3eb1, 0x3eb1, 0x3eb1, 0x3eb1, 0x3eb1, 0x3eb1, 0x3eac, + 0x3eac, 0x3eac, 0x3eac, 0x3eac, 0x3ea8, 0x3ea8, 0x3ea8, 0x3ea8, 0x3ea8, 0x3ea8, 0x3ea8, 0x3ea8, + 0x3ea8, 0x3ea3, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9a, + 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e95, 
0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e91, + 0x3e91, 0x3e91, 0x3e91, 0x3e91, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, + 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e82, 0x3e82, 0x3e82, 0x3e82, 0x3e82, 0x3e82, 0x3e7b, + 0x3e7b, 0x3e7b, 0x3e7b, 0x3e7b, 0x3e7b, 0x3e71, 0x3e71, 0x3e71, 0x3e71, 0x3e71, 0x3e71, 0x3e71, + 0x3e71, 0x3e71, 0x3e67, 0x3e67, 0x3e67, 0x3e67, 0x3e67, 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5e, + 0x3e5e, 0x3e5e, 0x3e5e, 0x3e54, 0x3e54, 0x3e54, 0x3e54, 0x3e54, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, + 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e40, 0x3e40, 0x3e40, 0x3e40, 0x3e40, 0x3e40, 0x3e36, + 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e2c, 0x3e2c, 0x3e2c, 0x3e2c, + 0x3e2c, 0x3e2c, 0x3e22, 0x3e22, 0x3e22, 0x3e22, 0x3e22, 0x3e22, 0x3e22, 0x3e22, 0x3e18, 0x3e18, + 0x3e18, 0x3e18, 0x3e18, 0x3e18, 0x3e18, 0x3e18, 0x3e0e, 0x3e0e, 0x3e0e, 0x3e0e, 0x3e0e, 0x3e0e, + 0x3e0e, 0x3e0e, 0x3e04, 0x3e04, 0x3e04, 0x3e04, 0x3e04, 0x3e04, 0x3e04, 0x3e04, 0x3df5, 0x3df5, + 0x3df5, 0x3df5, 0x3df5, 0x3df5, 0x3df5, 0x3df5, 0x3de0, 0x3de0, 0x3de0, 0x3de0, 0x3de0, 0x3de0, + 0x3de0, 0x3de0, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, + 0x3db8, 0x3db8, 0x3db8, 0x3db8, 0x3db8, 0x3db8, 0x3db8, 0x3da3, 0x3da3, 0x3da3, 0x3da3, 0x3da3, + 0x3da3, 0x3da3, 0x3da3, 0x3da3, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, + 0x3d8f, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d4d, 0x3d4d, + 0x3d4d, 0x3d4d, 0x3d4d, 0x3d4d, 0x3d4d, 0x3d4d, 0x3d4d, 0x3d24, 0x3d24, 0x3d24, 0x3d24, 0x3d24, + 0x3d24, 0x3d24, 0x3d24, 0x3d24, 0x3d24, 0x3cf6, 0x3cf6, 0x3cf6, 0x3cf6, 0x3cf6, 0x3cf6, 0x3cf6, + 0x3cf6, 0x3cf6, 0x3cf6, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, + 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x0, 0x0, + 0x0, 0x0, 0x0, +}; + +// = tolerant_max) { + exit(-1); + } + } + } + + return true; 
+} + +static void _gen_input(uint16_t *input_data, uint64_t ifmap_size, int range_start, int range_end) { + std::random_device rd; + std::mt19937 e2(rd()); + std::uniform_real_distribution<> dist(range_start, range_end); + + float LO = pow(2, range_start); + float HI = pow(2, range_end); + for (uint64_t i = 0; i < ifmap_size; i++) { + // input range is -8 ~ +8 + int table_hw = 256; + float input = + ((int)i % (range_end - 2)) * (((int)i % 2) ? 1 : -1) + 0.03 + (i % table_hw) * 0.002; + input = ((int)i % (range_end - 2)) * (((int)i % 2) ? 1 : 1) + 0.03 + (i % table_hw) * 0.002; + input_data[i] = convert_fp32_bf16(input); + input = dist(e2); + input = LO + static_cast(rand()) / (static_cast(RAND_MAX / (HI - LO))); + } +} + +static void gen_input(uint16_t *x, uint16_t *y, uint64_t ifmap_size, TEST_MODE mode, + int range_start, int range_end) { + if (mode == PRE_DATA_COMPARE_FIX) { + memcpy(x, &test_pattern, sizeof(test_pattern)); + } else { + range_start = abs(range_start); + range_end = abs(range_end); + _gen_input(x, ifmap_size, range_start, range_end); + } + + // invert for test + for (uint64_t i = 0; i < ifmap_size; i++) { + y[i] = x[(ifmap_size - 1) - i]; + } + + if (mode == DATA_COMPARE_ACCURACY_X_GT_0) { + // y = any + uint32_t i = 0; + for (; i < ifmap_size / 4; i++) { + // y < 0 + y[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(y[i])); + y[i + ifmap_size / 4] = convert_fp32_bf16(0); + } + } else if (mode == DATA_COMPARE_ACCURACY_X_LT_0_Y_GE_0) { + // x < 0 and y >= 0 + for (uint32_t i = 0; i < ifmap_size; i++) { + x[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(x[i])); + } + + for (uint32_t i = 0; i < ifmap_size / 4; i++) { + y[i + ifmap_size / 4] = convert_fp32_bf16(0); + } + } else if (mode == DATA_COMPARE_ACCURACY_X_LT_0_Y_LT_0) { + // x < 0 and y < 0 + for (uint32_t i = 0; i < ifmap_size; i++) { + x[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(x[i])); + y[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(y[i])); + } + } else if (mode == 
DATA_COMPARE_ACCURACY_X_0_Y_GT_0) { + // pi / 2, x = 0 and y > 0 + for (uint32_t i = 0; i < ifmap_size; i++) { + x[i] = convert_fp32_bf16(0); + } + } else if (mode == DATA_COMPARE_ACCURACY_X_0_Y_LT_0) { + // -pi / 2, x = 0 and y < 0 + for (uint32_t i = 0; i < ifmap_size; i++) { + x[i] = convert_fp32_bf16(0); + y[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(y[i])); + } + } + +#if 1 + + if (mode != PRE_DATA_COMPARE_FIX) { + int i = 0; + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(1.394531); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(0.394531); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(0.594531); + x[i] = convert_fp32_bf16(-10.0); + y[i++] = convert_fp32_bf16(6.0); + x[i] = convert_fp32_bf16(1.0); + y[i++] = convert_fp32_bf16(-1.); + x[i] = convert_fp32_bf16(-1.0); + y[i++] = convert_fp32_bf16(1.); + x[i] = convert_fp32_bf16(0.111816); + y[i++] = convert_fp32_bf16(0); + x[i] = convert_fp32_bf16(2.031250); + y[i++] = convert_fp32_bf16(0.0); + x[i] = convert_fp32_bf16(-2.031250); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(-1.394531); + y[i++] = convert_fp32_bf16(0.0); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(-6.0); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(-0.394531); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(-0.594531); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(0.0); + x[i] = convert_fp32_bf16(-8); + y[i++] = convert_fp32_bf16(0); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(3.0); + x[i] = convert_fp32_bf16(-1.0); + y[i++] = convert_fp32_bf16(-5.0); + x[i] = convert_fp32_bf16(-2.484375); + y[i++] = convert_fp32_bf16(-7.531250); + x[i++] = convert_fp32_bf16(-125.000000); + y[i] = convert_fp32_bf16(5.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(19.000000); + y[i] = convert_fp32_bf16(1.070312); + x[i++] = convert_fp32_bf16(0.498046); + y[i] = 
convert_fp32_bf16(0.000000); + x[i] = convert_fp32_bf16(424.000); + y[i++] = convert_fp32_bf16(-1.00); + x[i] = convert_fp32_bf16(2.484375); + y[i++] = convert_fp32_bf16(-7.531250); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(7.531250); + + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(-7.531250); + + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(0.394531); + y[i] = convert_fp32_bf16(-4.000000); + x[i++] = convert_fp32_bf16(-64.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-40.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-53.000000); + y[i] = convert_fp32_bf16(-9.000000); + x[i++] = convert_fp32_bf16(-91.000000); + y[i] = convert_fp32_bf16(12.000000); + x[i++] = convert_fp32_bf16(-164.000000); + y[i] = convert_fp32_bf16(-20.000000); + x[i++] = convert_fp32_bf16(-320.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-71.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-155.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-247.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-118.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-54.000000); + y[i] = convert_fp32_bf16(-5.000000); + x[i++] = convert_fp32_bf16(-392.000000); + y[i] = convert_fp32_bf16(-37.000000); + x[i++] = convert_fp32_bf16(-520.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-19.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-10.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = 
convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-21.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-17.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-17.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-10.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-69.000000); + y[i] = convert_fp32_bf16(4.000000); + x[i++] = convert_fp32_bf16(-86.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-34.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-136.000000); + y[i] = convert_fp32_bf16(-3.000000); + x[i++] = convert_fp32_bf16(-79.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-38.000000); + y[i] = convert_fp32_bf16(5.000000); + x[i++] = convert_fp32_bf16(-173.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-78.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = 
convert_fp32_bf16(-60.000000); + y[i] = convert_fp32_bf16(3.000000); + x[i++] = convert_fp32_bf16(-123.000000); + y[i] = convert_fp32_bf16(-9.000000); + x[i++] = convert_fp32_bf16(-280.000000); + y[i] = convert_fp32_bf16(3.000000); + x[i++] = convert_fp32_bf16(-39.000000); + y[i] = convert_fp32_bf16(2.000000); + x[i++] = convert_fp32_bf16(-524.000000); + y[i] = convert_fp32_bf16(11.000000); + x[i++] = convert_fp32_bf16(-376.000000); + y[i] = convert_fp32_bf16(5.000000); + x[i++] = convert_fp32_bf16(-131.000000); + y[i] = convert_fp32_bf16(11.000000); + x[i++] = convert_fp32_bf16(-324.000000); + y[i] = convert_fp32_bf16(9.000000); + x[i++] = convert_fp32_bf16(-125.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-92.000000); + y[i] = convert_fp32_bf16(-7.000000); + x[i++] = convert_fp32_bf16(-233.000000); + y[i] = convert_fp32_bf16(10.000000); + x[i++] = convert_fp32_bf16(-170.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-10.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-23.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(-3.000000); + x[i++] = convert_fp32_bf16(-37.000000); + + y[i] = convert_fp32_bf16(-9); + x[i++] = convert_fp32_bf16(-1); + + y[i] = convert_fp32_bf16(7.0); + x[i++] = convert_fp32_bf16(-1); + + y[i] = convert_fp32_bf16(0); + x[i++] = convert_fp32_bf16(-1); + } +#else + for (uint64_t i = 0; i < ifmap_size; i++) { + x[i] = convert_fp32_bf16(5.375000); + y[i] = convert_fp32_bf16(2.203125); + } +#endif + +#ifdef DBG + for (uint64_t i = 0; i < ifmap_size; i++) { + printf("source[%ld] y %f x %f\n", i, convert_bf16_fp32(y[i]), convert_bf16_fp32(x[i])); + } +#endif /* 
ifdef DBG */ +} + +static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) { + // TODO: check more shape / align + cvk_chip_info_t chip_info = bmk->info; + + uint32_t input_n = 1; + uint32_t input_c = chip_info.npu_num; + uint32_t input_h = 16; + uint32_t input_w = 16; + float epsilon = 0.1; + int range_start = -8; + int range_end = 8; + + if (mode == PRE_DATA_COMPARE_FIX) { + input_h = 4; + input_w = 8; + } + + cvk_fmt_t fmt = CVK_FMT_BF16; + + cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + cvk_tl_shape_t ofmap_shape = ifmap_shape; + + // get lut table shape and size + cvk_tl_shape_t table_shape; + uint64_t table_bytesize = cvm_lut_tbl_bytesize(bmk, &table_shape, fmt); + + // get input / output size + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + int data_type_size = bytesize_of_fmt(fmt); + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + if (mode == PRE_DATA_COMPARE_FIX) { + ofmap_bytesize = sizeof(golden_bf16) / sizeof(golden_bf16[0]) * data_type_size; + } + + // atan2 was two inputs + cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ifmap2 = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *out = tl_ofmap_bf16; + + // atan buf + cvk_tl_t *tl_y0_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_invert_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_pos_neg_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + + // reciprocal buf + cvk_tl_t *tl_reciprocal_table_answer = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_reciprocal_table_answer_mantissa = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + + // temp buf + cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf2 = 
test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf3 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1); + + uint16_t *input_data = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *input_data2 = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize); + + // for reciprocal + uint16_t *table_reciprocal_data = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_reciprocal_data_mantissa = (uint16_t *)xmalloc(table_bytesize); + + // for atan + uint16_t *table_data_atan_y0 = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_atan_invert = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_atan_pos_neg = (uint16_t *)xmalloc(table_bytesize); + + // for search '0' index + uint16_t *idx_0_table_data = (uint16_t *)xmalloc(table_bytesize); + + // init input / ref + // input_data is x, input_data2 is y + gen_input(input_data, input_data2, ifmap_size, mode, range_start, range_end); + tl_lut_ref(ref_data, input_data, input_data2, ifmap_shape); + + // init lut table + cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape); + cvm_atan_tbl(table_data_atan_y0, NULL, table_data_atan_invert, table_data_atan_pos_neg, + &table_shape); + cvm_gen_0_tbl(idx_0_table_data, &table_shape); + + // sys->local + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)input_data); + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap2, (uint8_t *)input_data2); + test_put_tensor_g2l_comp(ctx, bmk, tl_reciprocal_table_answer, (uint8_t *)table_reciprocal_data); + test_put_tensor_g2l_comp(ctx, bmk, tl_reciprocal_table_answer_mantissa, + (uint8_t *)table_reciprocal_data_mantissa); + + test_put_tensor_g2l_comp(ctx, bmk, tl_y0_buf, (uint8_t *)table_data_atan_y0); + test_put_tensor_g2l_comp(ctx, bmk, tl_invert_buf, (uint8_t *)table_data_atan_invert); + test_put_tensor_g2l_comp(ctx, bmk, tl_pos_neg_buf, (uint8_t *)table_data_atan_pos_neg); + + cvm_atan2_merge_emit(bmk, tl_ifmap2, 
tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, tl_invert_buf, + tl_pos_neg_buf, tl_reciprocal_table_answer, + tl_reciprocal_table_answer_mantissa, OUT tl_ofmap_bf16, fmt); + + uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, out); + verify(ofmap_data, ref_data, input_data, input_data2, ifmap_size, epsilon); + + free_tl(bmk, tl_buf3); + free_tl(bmk, tl_buf2); + free_tl(bmk, tl_buf); + free_tl(bmk, tl_reciprocal_table_answer_mantissa); + free_tl(bmk, tl_reciprocal_table_answer); + free_tl(bmk, tl_pos_neg_buf); + free_tl(bmk, tl_invert_buf); + free_tl(bmk, tl_y0_buf); + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_ifmap2); + free_tl(bmk, tl_ifmap); + + free(idx_0_table_data); + free(table_data_atan_y0); + free(table_data_atan_invert); + free(table_data_atan_pos_neg); + free(table_reciprocal_data); + free(table_reciprocal_data_mantissa); + free(input_data); + free(ref_data); + free(ofmap_data); + free(input_data2); +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bmk; + int round_mode; + + round_mode = set_store_feround(); + + test_init(&ctx, &bmk); + + // for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++) + // for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_ACCURACY; i++) { + // for (int i = DATA_COMPARE_ACCURACY; i < DATA_COMPARE_U8; i++) { + // for (int i = DATA_COMPARE_ACCURACY; i < DATA_COMPARE_ACCURACY_X_GT_0; i++) + // { + for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_U8; i++) { + mode = static_cast(i); + printf("test mode %d...\n", mode); + testbench(&ctx, bmk); + } + printf("pass\n"); + + test_exit(&ctx, bmk); + restore_feround(round_mode); + return 0; +} diff --git a/cvimath/tests/cvi1835/bf16_fp32.cpp b/cvimath/tests/cvi1835/bf16_fp32.cpp new file mode 100644 index 000000000..781dbc3fd --- /dev/null +++ b/cvimath/tests/cvi1835/bf16_fp32.cpp @@ -0,0 +1,148 @@ +// \file mask sample for gt(great than), ge(great equal), eq(equal), lt(less than), le(less equal) + +// header include +#include +#include // math 
+#include // kerenl + +void init_input(uint16_t *input_data, uint64_t ifmap_size) { + for (uint64_t i = 0; i < ifmap_size; i++) { + input_data[i] = convert_fp32_bf16(i * 1.0); + } +} + +void init_ref(uint16_t *input_data, uint32_t *ref_data, uint64_t ifmap_size) { + union s { + uint16_t int16[2]; // big endian + uint32_t int32; + }; + union s _s; + for (uint64_t i = 0; i < ifmap_size; i++) { + _s.int16[0] = 0; + _s.int16[1] = input_data[i]; + ref_data[i] = _s.int32; + } +} + +static void testbench(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, + cvk_tg_shape_t *bf16_tg_shape) { + // for calculate size we need in host + cvk_tl_shape_t ifmap_shape = {bf16_tg_shape->n, bf16_tg_shape->c, bf16_tg_shape->h, + bf16_tg_shape->w}; + + // * 2 means fp32 takes twice size of bf16 + cvk_tl_shape_t ofmap_shape = {bf16_tg_shape->n, bf16_tg_shape->c, bf16_tg_shape->h, + bf16_tg_shape->w * 2}; + + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + + // unit size is 1 bytes, bf16 takes 2 bytes + int data_type_size = 2; + + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + + // * 2 means fp32 takes twice size of bf16 + uint64_t ofmap_bytesize = ofmap_size * data_type_size * 2; + + uint8_t *input_data = (uint8_t *)xmalloc(ifmap_bytesize); + uint8_t *ref_data = (uint8_t *)xmalloc(ofmap_bytesize); + + // init input / output data in ddr + init_input((uint16_t *)input_data, ifmap_size); + init_ref((uint16_t *)input_data, (uint32_t *)ref_data, ifmap_size); + + // send host memory->device memory + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tg_shape_t fp32_tg_shape; + fp32_tg_shape = {ofmap_shape.n, ofmap_shape.c, ofmap_shape.h, ofmap_shape.w}; + + cvk_tg_t *bf16_tg = test_alloc_tg_mem_comp(rt_ctx, cvk_ctx, *bf16_tg_shape, fmt); + assert(bf16_tg && "alloc bf16 fail"); + + test_put_tg_mem_comp(rt_ctx, bf16_tg, (uint8_t *)input_data); + + cvk_tg_t *fp32_tg = test_alloc_tg_mem_comp(rt_ctx, cvk_ctx, fp32_tg_shape, fmt); + assert(bf16_tg && 
"alloc fp32 fail"); + + // prepare command buffer + cvm_bf16_fp32(cvk_ctx, bf16_tg, fp32_tg); + + // submit descriptor + test_submit_comp(rt_ctx, cvk_ctx); + + // get data from tl + uint8_t *ofmap_data = test_get_tg_mem_comp(rt_ctx, fp32_tg); + + // compare with reference with byte + for (uint32_t i = 0; i < ofmap_size; i++) { + if (ref_data[i] != ofmap_data[i]) { + fprintf(stderr, "comparing failed output[%u] got %u, ref %u\n", i, ofmap_data[i], + ref_data[i]); + // fail case + exit(-1); + } + } + + // free resource from tpu memory + test_free_tg_mem_comp(rt_ctx, bf16_tg); + test_free_tg_mem_comp(rt_ctx, fp32_tg); + + // free resource from host memory + free(input_data); + free(ref_data); + free(ofmap_data); +} + +int main() { + CVI_RT_HANDLE rt_ctx; + cvk_context_t *cvk_ctx; + int round_mode; + + // align kerenl rounding mode + round_mode = set_store_feround(); + + // init runtime / kerenl structure + test_init(&rt_ctx, &cvk_ctx); + + cvk_tg_shape_t bf16_tg_shape = {1, 2, 3, 4}; + { + // test 1 + printf("test bf16 <%d,%d,%d,%d> to fp32\n", bf16_tg_shape.n, bf16_tg_shape.c, bf16_tg_shape.h, + bf16_tg_shape.w); + testbench(&rt_ctx, cvk_ctx, &bf16_tg_shape); + printf("compare test bf16 to fp32 done\n"); + } + + { + // test 2 + bf16_tg_shape = {1, 20, 30, 40}; + printf("test bf16 <%d,%d,%d,%d> to fp32\n", bf16_tg_shape.n, bf16_tg_shape.c, bf16_tg_shape.h, + bf16_tg_shape.w); + testbench(&rt_ctx, cvk_ctx, &bf16_tg_shape); + printf("compare test bf16 to fp32 done\n"); + } + + bf16_tg_shape = {40, 40, 128, 256}; + for (int n = 1; n < (int)bf16_tg_shape.n; n += 10) { + for (int c = 1; c < (int)bf16_tg_shape.c; c += 10) { + for (int h = 1; h < (int)bf16_tg_shape.h; h += 100) { + for (int w = 2; w < (int)bf16_tg_shape.w; w += 100) { + printf("test bf16 <%d,%d,%d,%d> to fp32\n", bf16_tg_shape.n, bf16_tg_shape.c, + bf16_tg_shape.h, bf16_tg_shape.w); + cvk_tg_shape_t _bf16_tg_shape = {(uint32_t)n, (uint32_t)c, (uint32_t)h, (uint32_t)w}; + testbench(&rt_ctx, cvk_ctx, 
&_bf16_tg_shape); + printf("compare test bf16 to fp32 done\n"); + } + } + } + } + + // de-init runtime / kerenl structure + test_exit(&rt_ctx, cvk_ctx); + + // restore rounding mode + restore_feround(round_mode); + + return 0; +} diff --git a/cvimath/tests/cvi1835/blas_cpu.cpp b/cvimath/tests/cvi1835/blas_cpu.cpp new file mode 100644 index 000000000..d38ce9ca6 --- /dev/null +++ b/cvimath/tests/cvi1835/blas_cpu.cpp @@ -0,0 +1,60 @@ +#include + +#include +#include +#include +#include +#include + +int main() { + srand(time(NULL)); + const uint32_t data_length = 512; + const uint32_t data_num = 20000; + uint8_t *db = new uint8_t[data_num * data_length]; + float *db_unit = new float[data_num]; + uint8_t *data = new uint8_t[data_length]; + float *buffer_f = new float[data_num]; + memset(buffer_f, 0, data_num * sizeof(float)); + + for (uint32_t i = 0; i < data_length; i++) { + data[i] = rand() % 256; + } + for (uint32_t j = 0; j < data_num; j++) { + for (uint32_t i = 0; i < data_length; i++) { + db[j * data_length + i] = rand() % 256; + } + } + cvm_gen_db_unit_length(db, db_unit, data_length, data_num); + + const uint32_t k = 5; + uint32_t k_index[k] = {0}; + float k_value[k] = {0}; + struct timeval t0, t1; + gettimeofday(&t0, NULL); + cvm_cpu_u8data_ip_match(data, db, db_unit, k_index, k_value, buffer_f, data_length, data_num, k); + gettimeofday(&t1, NULL); + unsigned long elapsed_tpu = ((t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec); + printf("Searching time uint8: %lu us\n", elapsed_tpu); + printf("Result:\n"); + for (uint32_t i = 0; i < k; i++) { + printf("[%u] %f\n", k_index[i], k_value[i]); + } + printf("\n"); + gettimeofday(&t0, NULL); + cvm_cpu_i8data_ip_match((int8_t *)data, (int8_t *)db, db_unit, k_index, k_value, buffer_f, + data_length, data_num, k); + gettimeofday(&t1, NULL); + elapsed_tpu = ((t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec); + printf("Searching time int8: %lu us\n", elapsed_tpu); + printf("Result:\n"); + for 
(uint32_t i = 0; i < k; i++) { + printf("[%u] %f\n", k_index[i], k_value[i]); + } + printf("\n"); + + delete[] data; + delete[] db; + delete[] db_unit; + delete[] buffer_f; + return 0; +} \ No newline at end of file diff --git a/cvimath/tests/cvi1835/blas_tpu.cpp b/cvimath/tests/cvi1835/blas_tpu.cpp new file mode 100644 index 000000000..82be4a7c2 --- /dev/null +++ b/cvimath/tests/cvi1835/blas_tpu.cpp @@ -0,0 +1,134 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +void i8data_ip_match(CVI_RT_HANDLE ctx, cvk_context_t *cvk_ctx, uint64_t a_gaddr, int8_t *a_vaddr, + uint64_t db_gaddr, float *unit_db_arr, uint32_t *k_index, float *k_value, + uint64_t buffer_gemm_gaddr, uint8_t *buffer_gemm_vaddr, uint32_t *buffer_i32, + float *buffer_f, CVI_RT_MEM gemm_device, const uint32_t data_length, + const uint32_t data_num, const uint32_t k) { + size_t *slice_num = + cvm_gemm(cvk_ctx, a_gaddr, db_gaddr, buffer_gemm_gaddr, 1, data_length, data_num, CVK_FMT_I8); + CVI_RT_Submit(cvk_ctx); + CVI_RT_MemInvld(ctx, gemm_device); + cvm_combin_gemm_i8(slice_num, buffer_gemm_vaddr, buffer_i32, 1, data_num); + free(slice_num); + // Get a length + int32_t dot_result = 0; + for (uint32_t i = 0; i < data_length; i++) { + dot_result += ((short)a_vaddr[i] * a_vaddr[i]); + } + float unit_a = sqrt(dot_result); + // Get a length end + + for (uint32_t i = 0; i < data_num; i++) { + buffer_f[i] = ((int32_t *)buffer_i32)[i] / (unit_a * unit_db_arr[i]); + } + // Get k result + for (uint32_t i = 0; i < k; i++) { + int largest = 0; + for (uint32_t j = 0; j < data_num; j++) { + if (buffer_f[j] > buffer_f[largest]) { + largest = j; + } + } + k_value[i] = buffer_f[largest]; + k_index[i] = largest; + buffer_f[largest] = 0; + } +} + +int main() { + CVI_RT_HANDLE ctx; + CVI_RT_Init(&ctx); + cvk_context_t *bk_ctx = (cvk_context_t *)CVI_RT_RegisterKernel(ctx, 100000); + printf("123\n"); + + const uint32_t data_length = 512; + const uint32_t data_num = 1000; + // 
Allocate memory + CVI_RT_MEM bmmem_a = CVI_RT_MemAlloc(ctx, data_length); + CVI_RT_MEM bmmem_db = CVI_RT_MemAlloc(ctx, data_length * data_num); + CVI_RT_MEM bmmem_c = CVI_RT_MemAlloc(ctx, data_num * sizeof(uint32_t)); + + uint64_t gaddr_a = CVI_RT_MemGetPAddr(bmmem_a); + uint64_t gaddr_db = CVI_RT_MemGetPAddr(bmmem_db); + uint64_t gaddr_c = CVI_RT_MemGetPAddr(bmmem_c); + + uint8_t *vaddr_a = CVI_RT_MemGetVAddr(bmmem_a); + uint8_t *vaddr_db = CVI_RT_MemGetVAddr(bmmem_db); + uint8_t *vaddr_c = CVI_RT_MemGetVAddr(bmmem_c); + + int8_t *db_raw = new int8_t[data_length * data_num]; + float *db_unit = new float[data_num]; + uint32_t *buffer = new uint32_t[data_num]; + float *buffer_f = new float[data_num]; + + // Generate data + srand(time(NULL)); + for (uint32_t i = 0; i < data_length; i++) { + ((int8_t *)vaddr_a)[i] = rand() % 10 - 10; + } + for (uint32_t j = 0; j < data_num; j++) { + for (uint32_t i = 0; i < data_length; i++) { + ((int8_t *)db_raw)[j * data_length + i] = rand() % 10 - 10; + } + } + + // Pass db feature to ion + for (uint32_t n = 0; n < data_num * data_length; n++) { + int i = n / data_num; + int j = n % data_num; + ((int8_t *)vaddr_db)[n] = db_raw[data_length * j + i]; + } + + // Calculate unit length for db feature + cvm_gen_precached_i8_unit_length((int8_t *)db_raw, db_unit, data_length, data_num); + CVI_RT_MemFlush(ctx, bmmem_a); + CVI_RT_MemFlush(ctx, bmmem_db); + + const uint32_t k = 5; + uint32_t k_index[k] = {0}; + float k_value[k] = {0}; + struct timeval t0, t1; + gettimeofday(&t0, NULL); + i8data_ip_match(ctx, bk_ctx, gaddr_a, (int8_t *)vaddr_a, gaddr_db, db_unit, k_index, k_value, + gaddr_c, vaddr_c, buffer, buffer_f, bmmem_c, data_length, data_num, k); + gettimeofday(&t1, NULL); + unsigned long elapsed_tpu = ((t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec); + printf("Searching time tpu int8: %lu us\n", elapsed_tpu); + printf("Result:\n"); + for (uint32_t i = 0; i < k; i++) { + printf("[%u] %f\n", k_index[i], k_value[i]); + } + 
printf("\n"); + + gettimeofday(&t0, NULL); + cvm_cpu_i8data_ip_match((int8_t *)vaddr_a, (int8_t *)db_raw, db_unit, k_index, k_value, buffer_f, + data_length, data_num, k); + gettimeofday(&t1, NULL); + elapsed_tpu = ((t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec); + printf("Searching time int8: %lu us\n", elapsed_tpu); + printf("Result:\n"); + for (uint32_t i = 0; i < k; i++) { + printf("[%u] %f\n", k_index[i], k_value[i]); + } + printf("\n"); + + delete[] db_unit; + delete[] buffer; + delete[] buffer_f; + CVI_RT_MemFree(ctx, bmmem_a); + CVI_RT_MemFree(ctx, bmmem_db); + CVI_RT_MemFree(ctx, bmmem_c); + CVI_RT_UnRegisterKernel(bk_ctx); + CVI_RT_DeInit(ctx); + return 0; +} diff --git a/cvimath/tests/cvi1835/depthwise_reshape_same.cpp b/cvimath/tests/cvi1835/depthwise_reshape_same.cpp new file mode 100644 index 000000000..b591e5c30 --- /dev/null +++ b/cvimath/tests/cvi1835/depthwise_reshape_same.cpp @@ -0,0 +1,907 @@ +#include +#include + +#include // calc_dilute_hw + +#define NPU_NUM (1 << 5) +typedef cvk_tiu_depthwise_pt_convolution_param_t param_t; + +int random_seed; +static void print_pooling_param(param_t *p) { + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int kh = p->weight->shape.h; + int kw = p->weight->shape.w; + + printf(" Pooling parameters:\n"); + // printf(" random_seed : %d \n", random_seed); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8); + printf(" weight = (%d, %d)\n", kh, kw); + printf(" padding = (%d, %d, %d, %d)\n", p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + // printf(" ins0 = (%d, %d, %d, %d)\n", + // p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w); + // printf(" dilation = (%d, %d)\n",p->dilation_h, p->dilation_w); + // printf(" rshift_bits = %d\n", p->rshift_bits); + // printf(" relu_enable = %d\n", p->relu_enable); 
+ printf(" res0_sign = %d\n", p->ofmap->fmt == CVK_FMT_I8); +} + +static uint16_t *alloc_input(int ic, int ih, int iw, cvk_fmt_t ifmt) { + uint64_t size = ic * ih * iw; + uint16_t *data = (uint16_t *)new uint16_t[(size)]; + if (ifmt == CVK_FMT_BF16) { + for (uint64_t i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX / 2; // 5 ~ -5 + val = (float)(rand() - RAND_MAX2) * 5 / (float)RAND_MAX; + val = i; + data[i] = convert_fp32_bf16(val); + } + } else { + uint8_t *d = (uint8_t *)data; + for (uint64_t i = 0; i < size; i++) { + d[i] = i % 10 * (i % 2 ? -1 : 1); + } + } + + return data; +} + +static uint16_t *alloc_weight(int ic, int kh, int kw, cvk_fmt_t fmt) { + int size = ic * kh * kw; + uint16_t *data = (uint16_t *)malloc(size * sizeof(uint16_t)); + // printf("weight size is %d\n", size * 2); + if (fmt == CVK_FMT_BF16) { + for (int i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX / 2; // 5 ~ -5 + val = (float)(rand() - RAND_MAX2) * 5 / (float)RAND_MAX; + val = i; + data[i] = convert_fp32_bf16(val); + } + } else { + uint8_t *d = (uint8_t *)data; + for (int i = 0; i < size; i++) { + d[i] = i % 5 * (i % 2 ? -1 : 1); + } + } + return data; +} + +static uint32_t *alloc_bias(int ic, cvk_fmt_t fmt) { + int c = ic; + uint64_t size = c; + uint32_t *bias = (uint32_t *)malloc(sizeof(uint32_t) * c); + if (fmt == CVK_FMT_BF16) { + for (int i = 0; i < c; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX / 2; // 2 ~ -2 + val = (float)(rand() - RAND_MAX2) * 2 / (float)RAND_MAX; + val = i; + bias[i] = convert_fp32_hex(val); + } + } else { + uint16_t *d = (uint16_t *)bias; + for (uint64_t i = 0; i < size; i++) { + d[i] = i % 0xf * (i % 2 ? 
-1 : 1); + } + } + return bias; +} + +static uint16_t *alloc_output(int ic, int oh, int ow) { + uint64_t size = ic * oh * ow; + return (uint16_t *)new uint16_t[(size)]; +} + +static inline void cvm_relu(uint16_t *buf, uint64_t size, cvk_fmt_t fmt) { + if (fmt == CVK_FMT_BF16) { + for (uint64_t i = 0; i < size; i++) + if (convert_bf16_fp32(buf[i]) < 0) buf[i] = convert_fp32_bf16(0); + } else { + int8_t *buf_int8_t = (int8_t *)buf; + for (uint64_t i = 0; i < size; i++) { + if (buf_int8_t[i] < 0) buf_int8_t[i] = 0; + } + } +} + +static int index_get(int h, int w1, int w2) { return h * w1 + w2; } + +int native_pooling_avg_bf16(const uint16_t *i_fmap, const void *weight, const uint32_t *bias, + uint16_t *o_fmap, int input_n, int input_c, int input_h, int input_w, + int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, int ins_h, int ins_w, int ins_h_last, + int ins_w_last, int dh, int dw, int const_weight) { + if (kh * kw <= 0) return BM_ERR_INVALID_ARGUMENT; + + uint16_t avg_const_weight = *(uint16_t *)weight; + uint16_t *weight_arr = (uint16_t *)weight; + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + int d_kh = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int d_kw = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int output_h = calc_output_hw(h_after, d_kh, stride_h); + int output_w = calc_output_hw(w_after, d_kw, stride_w); + // printf("output_h/output_w is %d/%d\n", output_h, output_w); + float *avg_pooling_mac_a = (float *)malloc(d_kh * d_kw * sizeof(float)); + float *avg_pooling_mac_b = (float *)malloc(d_kh * d_kw * sizeof(float)); + + uint16_t *i_fmap_pad = NULL; + uint16_t *i_kmap_pad = NULL; + for (int n = 0; n < input_n; n++) { + if (const_weight == 0) weight_arr = (uint16_t *)weight; + + for (int c = 0; c < input_c; ++c) { + fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, 0, pad_w_l, pad_w_r, pad_h_t, pad_h_b, ins_h, ins_w, + 
ins_h_last, ins_w_last, input_h, input_w); + + // kernel_dilation( + if (const_weight == 0) + fill_pad_fmap_bf16((weight_arr), &i_kmap_pad, 0, 0, 0, 0, + 0, // no padding + dh - 1, dw - 1, 0, 0, kh, kw); + + float avg_pool_result; + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + int mac_index = 0; + + float r = 0; + for (int h = 0; h < d_kh; h++) { + for (int w = 0; w < d_kw; w++) { + int index = index_get((hstart + h), w_after, (w + wstart)); + mac_index = h * d_kw + w; + + avg_pooling_mac_a[mac_index] = convert_bf16_fp32(i_fmap_pad[index]); + + avg_pooling_mac_b[h * d_kw + w] = const_weight + ? convert_bf16_fp32(avg_const_weight) + : convert_bf16_fp32(i_kmap_pad[mac_index]); + +#if 0 + printf ("ref[ni %u][ci %u][oh/ow %u/%u][kh/kw %u/%u] o[%d]" + " %.1f * %.1f + %.1f = %.1f\n", + n, c, ph, pw, h, w, pool_index, + avg_pooling_mac_a[mac_index], avg_pooling_mac_b[h*d_kw+w], + r, r + avg_pooling_mac_a[mac_index] * avg_pooling_mac_b[h*d_kw+w]); +#endif + + r += avg_pooling_mac_a[mac_index] * avg_pooling_mac_b[h * d_kw + w]; + } + } + + inner_float_product(avg_pooling_mac_a, avg_pooling_mac_b, d_kh * d_kw, &avg_pool_result); + + if (bias) { + avg_pool_result += convert_hex_fp32(bias[c]); + } + *(o_fmap + pool_index) = convert_fp32_bf16(avg_pool_result); + } + } + weight_arr += kh * kw; + i_fmap += input_w * input_h; + o_fmap += output_w * output_h; + } + } + free(i_fmap_pad); + free(i_kmap_pad); + free(avg_pooling_mac_a); + free(avg_pooling_mac_b); + + return BM_SUCCESS; +} + +static int get_fsz(cvk_fmt_t fmt) { + assert(fmt == CVK_FMT_BF16 || fmt == CVK_FMT_I8 || fmt == CVK_FMT_U8); + return fmt == CVK_FMT_BF16 ? 
2 : 1; +} + +static void compare_results(param_t *p, uint16_t input[], uint16_t weight[], uint32_t bias[], + uint16_t output[], uint16_t output_ref[], uint32_t org_o_shape_size, + int is_valid_pack, int org_oc, int org_oh, int org_ow) { + assert(input); + assert(weight); + (void)input; + (void)weight; + printf("bias at %p\n", bias); + int f_sz = get_fsz(p->ofmap->fmt); + + if (p->relu_enable) { + cvm_relu(output_ref, org_o_shape_size, p->ofmap->fmt); + } + + int cmp_res = -1; + if (!is_valid_pack) { + // we reshape c with SAME mode padding with garbage + // \is_valid_pack set to false means we skip garbage part + int org_hw = org_oh * org_ow; + int new_hw = p->ofmap->shape.h * p->ofmap->shape.w; + int duplicated_c = p->ofmap->shape.c / org_oc; + + assert(new_hw >= org_hw / duplicated_c); + + int8_t *output_c = ((int8_t *)output); + int8_t *output_ref_c = ((int8_t *)output_ref); + for (int c = 0; c < org_oc; c++) { + cmp_res = + array_cmp_int8("Comparing results ...\n", output_c + c * duplicated_c * new_hw * f_sz, + output_ref_c + org_hw * c * f_sz, org_hw * f_sz); + + if (cmp_res != 0) { + break; + } + // printf("compare [%d] pass, org len is %u, new len is %u\n", c, + // org_hw, duplicated_c * new_hw); + } + } else { + cmp_res = array_cmp_int8("Comparing results ...\n", (int8_t *)output_ref, (int8_t *)output, + org_o_shape_size * f_sz); + } + if (cmp_res != 0) { + printf("Comparison FAILED!!!\n"); + // print_pooling_param(p); + exit(-1); + } + + delete[] output_ref; +} + +static int pooling_ih_ext(int ins_h, int ins_last_h, int pad_top, int pad_bottom, int ih) { + int ins = ins_h; + int ins_last = ins_last_h; + int pad = pad_top + pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(int ins_w, int ins_last_w, int pad_left, int pad_right, int iw) { + int ins = ins_w; + int ins_last = ins_last_w; + int pad = pad_left + pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(int ins_h, int 
ins_last_h, int pad_top, int pad_bottom, int stride_h, int ih, + int kh, int dh) { + int ih_ext = pooling_ih_ext(ins_h, ins_last_h, pad_top, pad_bottom, ih); + int d_h = (kh - 1) * dh + 1; + return (ih_ext - d_h) / stride_h + 1; +} + +static int pooling_ow(int ins_w, int ins_last_w, int pad_left, int pad_right, int stride_w, int iw, + int kw, int dw) { + int iw_ext = pooling_iw_ext(ins_w, ins_last_w, pad_left, pad_right, iw); + int d_w = (kw - 1) * dw + 1; + return (iw_ext - d_w) / stride_w + 1; +} + +static void free_depthwise_struct(param_t *p) { + free((void *)p->ofmap); + free((void *)p->ifmap); + free((void *)p->weight); + if (p->bias) { + free((void *)p->bias); + } + + p->ofmap = NULL; + p->ifmap = NULL; + p->weight = NULL; + p->bias = NULL; +} + +static void free_depthwise_param(cvk_context_t *ctx, param_t *p) { + if (p->ofmap) free_tl(ctx, p->ofmap); + + if (p->weight) free_tl(ctx, p->weight); + + if (p->bias) free_tl(ctx, p->bias); + + if (p->ifmap) free_tl(ctx, p->ifmap); +} + +static param_t random_depthwise_param(cvk_context_t *ctx, int _ih, int _iw, int _stride_h, + cvk_fmt_t _fmt) { + param_t p; + + // retry: + random_seed = clock(); + srand(random_seed); + int using_bias = rand() % 2; + int n = rand() % 5 + 1; + n = 1; + int c = rand() % (3 * NPU_NUM) + 1; + c = 3; + int ih = rand() % 30 + 3; + int iw = rand() % 30 + 6; + int kh = rand() % 7 + 1; + int kw = rand() % 7 + 1; + + p.ins_h = rand() % kh; + p.ins_w = rand() % kw; + p.ins_last_h = rand() % kh; + p.ins_last_w = rand() % kw; + p.stride_h = rand() % kh + 1; + p.stride_w = rand() % kw + 1; + p.pad_top = rand() % kh; + p.pad_bottom = rand() % kh; + p.pad_left = rand() % kw; + p.pad_right = rand() % kw; + p.rshift_bits = rand() % 32; + p.dilation_h = rand() % 4 + 1; + p.dilation_w = rand() % 4 + 1; + + // default + cvk_fmt_t ifmt = CVK_FMT_BF16; + cvk_fmt_t other_fmt = CVK_FMT_BF16; + ih = 24; + iw = 16; + kw = 5; + kh = 5; + p.stride_h = 1; + p.stride_w = 1; + + p.rshift_bits = 0; + + ih = _ih; 
+ p.stride_h = _stride_h; + iw = _iw; + ifmt = _fmt; + other_fmt = CVK_FMT_I8; + if (ifmt != CVK_FMT_BF16) { + } else { + other_fmt = CVK_FMT_BF16; + } + + p.pad_left = 2; + p.pad_right = 2; + p.pad_top = 0; + p.pad_bottom = 0; + // TODO: pad / ins / dilation + p.ins_h = 0; + p.ins_last_h = 0; + p.ins_w = 0; + p.ins_last_w = 0; + p.dilation_h = 1; + p.dilation_w = 1; + + int oh = + pooling_oh(p.ins_h, p.ins_last_h, p.pad_top, p.pad_bottom, p.stride_h, ih, kh, p.dilation_h); + int ow = + pooling_ow(p.ins_w, p.ins_last_w, p.pad_left, p.pad_right, p.stride_w, iw, kw, p.dilation_w); + + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = n; + ofmap_shape.c = c; + ofmap_shape.h = oh; + ofmap_shape.w = ow; + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = n; + ifmap_shape.c = c; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + cvk_tl_shape_t weight_shape; + weight_shape.n = 1; + weight_shape.c = c; + weight_shape.h = kh; + weight_shape.w = kw; + cvk_tl_shape_t bias_shape; + bias_shape.n = 2; + bias_shape.c = c; + bias_shape.h = 1; + bias_shape.w = 1; + p.relu_enable = rand() % 2; + + // fake init for ref + cvk_tl_t *bias, *weight, *ofmap, *ifmap; + ifmap = (cvk_tl_t *)malloc(sizeof(cvk_tl_t)); + if (using_bias) { + bias = (cvk_tl_t *)malloc(sizeof(cvk_tl_t)); + } + weight = (cvk_tl_t *)malloc(sizeof(cvk_tl_t)); + ofmap = (cvk_tl_t *)malloc(sizeof(cvk_tl_t)); + + p.bias = NULL; + if (using_bias) { + bias->start_address = -1; + bias->fmt = other_fmt; + bias->shape = bias_shape; + bias->stride = ctx->ops->tl_default_stride(ctx, bias->shape, other_fmt, /*eu_align*/ 0); + p.bias = bias; + } + + weight->start_address = -1; + weight->fmt = other_fmt; + weight->shape = weight_shape; + weight->stride = ctx->ops->tl_default_stride(ctx, weight->shape, other_fmt, /*align*/ 1); + p.weight = weight; + + ofmap->start_address = -1; + ofmap->fmt = other_fmt; + ofmap->shape = ofmap_shape; + ofmap->stride = ctx->ops->tl_default_stride(ctx, ofmap->shape, other_fmt, /*align*/ 1); + p.ofmap = ofmap; + + 
ifmap->start_address = -1; + ifmap->fmt = ifmt; + ifmap->shape = ifmap_shape; + ifmap->stride = ctx->ops->tl_default_stride(ctx, ifmap->shape, ifmt, /*align*/ 1); + p.ifmap = ifmap; + +#if 0 + int d_kh = calc_dilute_hw(kh, p.dilation_h - 1, 0, 0, 0); + int d_kw = calc_dilute_hw(kw, p.dilation_w - 1, 0, 0, 0); + if ((kh > pooling_ih_ext(&p, ih)) + || (kw > pooling_iw_ext(&p, iw)) + || (oh < d_kh) + || (ow < d_kw) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap + || !p.weight + || (using_bias && !p.bias) +) { + LOG(INFO) << "retry init_pooling_param"; + assert(0 && "it MUST valid param pass"); + goto retry; + } +#endif + return p; +} + +static void put_bias_tensor(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx, const cvk_tl_t *tl, + uint32_t data[]) { + int c = tl->shape.c; + + uint16_t *hi_lo = (uint16_t *)malloc(sizeof(uint16_t) * 2 * c); + if (tl->fmt == CVK_FMT_BF16) { + for (int i = 0; i < c; i++) { + hi_lo[i] = (data[i] >> 16) & 0xffff; + hi_lo[i + c] = (data[i] & 0xffff); + } + } else { + uint8_t *hi_lo_uint8_t = (uint8_t *)hi_lo; + uint16_t *data_uint16_t = (uint16_t *)data; + for (int i = 0; i < c; i++) { + hi_lo_uint8_t[i] = data_uint16_t[i] & 0xff; + hi_lo_uint8_t[i + c] = (data_uint16_t[i] >> 8) & 0xff; + } + } + put_bf16_tensor_g2l(ctx, bk_ctx, tl, (uint16_t *)hi_lo, tl->fmt); + + free(hi_lo); +} + +/** + * \brief + */ +static int reshape_valid_output(cvk_context_t *bk_ctx, const cvk_tl_t *ofmap, int org_oc, + int org_oh, int org_ow, cvk_tl_shape_t *tl_shape, + cvk_tl_stride_t *tl_load_stride, cvk_tg_shape_t *tg_shape, + cvk_tg_stride_t *tg_stride, cvk_fmt_t fmt) { + assert(fmt == CVK_FMT_BF16 || fmt == CVK_FMT_I8 || fmt == CVK_FMT_U8); + + // skip redundant one + // store to sys and re-slice, maybe use next layer + // sys->local skip redundant one + + tg_shape->n = tl_shape->n = 1; + tg_shape->c = tl_shape->c = org_oc; + tg_shape->h = tl_shape->h = org_oh; + 
tg_shape->w = tl_shape->w = org_ow; + + cvk_tl_stride_t s = bk_ctx->ops->tl_default_stride(bk_ctx, *tl_shape, fmt, /*eu_align*/ 0); + + tl_load_stride->n = s.n; + tl_load_stride->c = s.c; + tl_load_stride->h = s.h; + tl_load_stride->w = s.w; + + int duplicat_c = ofmap->shape.c / org_oc; + tg_stride->n = tg_stride->c = duplicat_c * ofmap->shape.h * ofmap->shape.w * get_fsz(fmt); + tg_stride->h = org_ow * get_fsz(fmt); + + return 0; +} + +static bmerr_t init_ref(int ic, int ih, int iw, int kh, int kw, int pad_right, int pad_left, + int stride_h, int stride_w, cvk_fmt_t fmt, uint16_t *input, + uint16_t *weight, uint32_t *bias, uint16_t *output_ref) { + bmerr_t ret; + int in = 1; + int ins_h = 0; + int ins_w = 0; + int ins_last_h = 0; + int ins_last_w = 0; + int dilation_h = 1; + int dilation_w = 1; + int pad_top = 0; + int pad_bottom = 0; + int rshift_bits = 0; + + if (fmt == CVK_FMT_BF16) { + ret = native_pooling_avg_bf16(input, weight, bias ? bias : NULL, output_ref, in, ic, ih, iw, kh, + kw, pad_top, pad_bottom, pad_left, pad_right, stride_h, stride_w, + ins_h, ins_w, ins_last_h, ins_last_w, dilation_h, dilation_w, 0); + } else { + int opd0_sign = fmt == CVK_FMT_I8; + int res0_sign = true; //(ofmap->fmt == CVK_FMT_I8); + ret = native_pooling_ave_int8((int8_t *)input, (int8_t *)weight, bias ? 
(int16_t *)bias : NULL, + (int8_t *)output_ref, in, ic, ih, iw, kh, kw, pad_top, pad_bottom, + pad_left, pad_right, stride_h, stride_w, ins_h, ins_w, ins_last_h, + ins_last_w, opd0_sign, res0_sign, rshift_bits, 0); + } + return ret; +} + +static int test_depthwise(CVI_RT_HANDLE ctx, cvk_context_t *bk_ctx, int ic, int ih, int iw, int kh, + int kw, int pad_right, int pad_left, int stride_h, int stride_w, + bool has_bias, cvk_fmt_t ifmt) { + // print_pooling_param(param); + param_t param; + param_t *p = ¶m; + assert(ifmt == CVK_FMT_BF16 || ifmt == CVK_FMT_I8 || ifmt == CVK_FMT_U8); + + int in = 1; + // TODO: verify dialate > 1 + int dilation_h = 1; + int dilation_w = 1; + int relu_enable = 0; + int rshift_bits = 0; + + // TODO: verity ins_x + int org_oh = pooling_oh(0, 0, 0, 0, stride_h, ih, kh, dilation_h); + int org_ow = pooling_ow(0, 0, pad_left, pad_right, stride_w, iw, kw, dilation_w); + int org_oc = ic; + int org_o_shape_size = in * org_oc * org_oh * org_ow; + uint16_t *output; + cvk_tdma_g2l_tensor_copy_param_t p1; + cvk_tdma_l2g_tensor_copy_param_t p2; + // weight / ofmap not support U8 format + cvk_fmt_t other_fmt = ifmt == CVK_FMT_BF16 ? 
CVK_FMT_BF16 : CVK_FMT_I8; + + // alloc testbench, input/ref + uint16_t *input = alloc_input(ic, ih, iw, ifmt); + uint16_t *weight = alloc_weight(ic, kh, kw, ifmt); + uint32_t *bias = NULL; + if (has_bias) bias = alloc_bias(ic, ifmt); + + uint16_t *output_ref = alloc_output(ic, org_oh, org_ow); + + // init ref + init_ref(ic, ih, iw, kh, kw, pad_right, pad_left, stride_h, stride_w, ifmt, input, weight, bias, + output_ref); + // assert(ret == BM_SUCCESS); + + // init param + // TODO: verify pad_top/pad_bottom + // TODO: verify ins_h_x + p->pad_left = pad_left; + p->pad_right = pad_right; + p->pad_top = 0; + p->pad_bottom = 0; + p->ins_h = 0; + p->ins_last_h = 0; + p->ins_w = 0; + p->ins_last_w = 0; + p->dilation_h = dilation_h; + p->dilation_w = dilation_w; + p->stride_h = stride_h; + p->stride_w = stride_w; + + p->relu_enable = relu_enable; + p->rshift_bits = rshift_bits; + p->bias = NULL; + + // prepard load / input / weight / bias / output new shape / stride + cvk_tl_shape_t tl_load_shape; + cvk_tl_stride_t tl_load_stride; + cvk_tg_shape_t tg_shape; + cvk_tg_stride_t tg_stride; + cvk_tl_shape_t tl_weight_shape; + cvk_tl_shape_t tl_bias_shape; + cvk_tl_shape_t tl_output_shape; + cvk_tl_t *tmp_tl_load; + cvk_tg_t *tmp_tg; + + // get reshaped information + int r = cvm_reshape_channel_same(bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, stride_h, + stride_w, &tl_load_shape, &tl_load_stride, &tg_shape, &tg_stride, + &tl_weight_shape, &tl_bias_shape, &tl_output_shape, ifmt, + /*align*/ 1); + + if (r == -1) { + printf("could not reshape it, 81\n"); + free_depthwise_param(bk_ctx, p); + + delete[] input; + free(weight); + free(bias); + return -1; + } + + // prepare input tg + { + cvk_tg_shape_t put_tg_shape; + + put_tg_shape.n = in; + put_tg_shape.c = ic; + put_tg_shape.h = ih; + put_tg_shape.w = iw; + cvk_tg_t *put_tg = alloc_tg_bf16_gmem(&ctx, bk_ctx, put_tg_shape, ifmt); + put_tg_bf16_gmem(&ctx, put_tg, (uint8_t *)input); + free_tg_gmem(&ctx, put_tg); + } + + // 
prepare load input, put to tg and load back + { + tmp_tl_load = alloc_tl_bf16(bk_ctx, tl_load_shape, ifmt, /*eu_align*/ 0); + assert(tmp_tl_load); + + tmp_tg = alloc_tg_bf16_gmem(&ctx, bk_ctx, tg_shape, ifmt); + tmp_tg->stride = tg_stride; + + p1.src = tmp_tg; + p1.dst = tmp_tl_load; + + bk_ctx->ops->tdma_g2l_bf16_tensor_copy(bk_ctx, &p1); + test_submit_comp(&ctx, bk_ctx); + free_tg_gmem(&ctx, tmp_tg); + + // fit for hw + tmp_tl_load->stride = + bk_ctx->ops->tl_default_stride(bk_ctx, tmp_tl_load->shape, ifmt, /*align*/ 1); + p->ifmap = tmp_tl_load; + } + + // prepare load bias, put to tg and load back + if (has_bias) { + // bias must i8 + cvk_fmt_t bias_fmt = ifmt == CVK_FMT_BF16 ? CVK_FMT_BF16 : CVK_FMT_I8; + p->bias = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, tl_bias_shape, bias_fmt, 0); + + // duplicate bias and replace old + uint32_t *new_bias = cvm_reshape_channel_bias((uint8_t *)bias, tl_bias_shape.n, tl_bias_shape.c, + tl_bias_shape.h, tl_bias_shape.w, org_oc, ifmt); + + // free old one + free(bias); + bias = new_bias; + put_bias_tensor(&ctx, bk_ctx, p->bias, bias); + } + + // prepare load weight, put to tg and load back + { + p->weight = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, tl_weight_shape, other_fmt, /*align*/ 1); + assert(p->weight); + + // duplicate kernel with c + uint8_t *new_weight = + cvm_reshape_channel_weight((uint8_t *)weight, tl_weight_shape.n, tl_weight_shape.c, + tl_weight_shape.h, tl_weight_shape.w, org_oc, ifmt); + + // free old one + free(weight); + weight = (uint16_t *)new_weight; + put_bf16_tensor_g2l(&ctx, bk_ctx, p->weight, (uint16_t *)weight, ifmt); + } + + // prepard ofmap + { + // we allocate 'same' mode shape + p->ofmap = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, tl_output_shape, other_fmt, /*align*/ 1); + assert(p->ofmap); + } + + // printf("p->ifmap at %p, c is %d\n", p->ifmap, tmp_tl_load->shape.c); + + // emit + if (ifmt == CVK_FMT_BF16) { + bk_ctx->ops->tiu_pt_depthwise_convolution(bk_ctx, p); + } else { + 
bk_ctx->ops->tiu_pt_depthwise_convolution(bk_ctx, p); + } + + // output = (uint16_t *)get_bf16_tensor_l2g(&ctx, bk_ctx, p->ofmap, ifmt); + + // check with no pad if true + int is_valid_pack = false; + cvk_tl_shape_t r_ofmap_shape; + cvk_tl_stride_t r_ofmap_stride; + cvk_tg_shape_t r_tg_shape; + cvk_tg_stride_t r_tg_stride; + + reshape_valid_output(bk_ctx, p->ofmap, org_oc, org_oh, org_ow, &r_ofmap_shape, &r_ofmap_stride, + &r_tg_shape, &r_tg_stride, ifmt); + + p1.dst = p->ofmap; + + if (is_valid_pack) { + cvk_tg_shape_t dst_shape; + dst_shape.n = p->ofmap->shape.n; + dst_shape.c = p->ofmap->shape.c; + dst_shape.h = p->ofmap->shape.h; + dst_shape.w = p->ofmap->shape.w; + cvk_tg_t *cvk_tg_tmp = alloc_tg_bf16_gmem(&ctx, bk_ctx, dst_shape, ifmt); + + p2.src = p->ofmap; + p2.dst = cvk_tg_tmp; + + // store for later reshape + bk_ctx->ops->tdma_l2g_bf16_tensor_copy(bk_ctx, &p2); + test_submit_comp(&ctx, bk_ctx); + + // free useless for later reallocate + free_depthwise_param(bk_ctx, p); + + p->ofmap = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, r_ofmap_shape, ifmt, + /*eu_align*/ 0); + assert(p->ofmap); + + cvk_tg_tmp->shape = r_tg_shape; + cvk_tg_tmp->stride = r_tg_stride; + + p1.src = cvk_tg_tmp; + p1.dst = p->ofmap; + bk_ctx->ops->tdma_g2l_bf16_tensor_copy(bk_ctx, &p1); + free_tg_gmem(&ctx, cvk_tg_tmp); + } + + cvk_fmt_t ofmap_fmt = ifmt == CVK_FMT_BF16 ? 
CVK_FMT_BF16 : CVK_FMT_I8; + output = (uint16_t *)get_bf16_tensor_l2g(&ctx, bk_ctx, p1.dst, ofmap_fmt); + compare_results(p, input, weight, bias, output, output_ref, org_o_shape_size, is_valid_pack, + org_oc, org_oh, org_ow); + + // free resource + if (is_valid_pack) { + free_tl(bk_ctx, p->ofmap); + } else { + free_depthwise_param(bk_ctx, p); + } + + delete[] input; + free(weight); + free(bias); + free(output); + + return 1; +} + +static void init_input(param_t *p, int *ic, int *ih, int *iw, int *kh, int *kw, int *pad_right, + int *pad_left) { + *ic = p->ifmap->shape.c; + *ih = p->ifmap->shape.h; + *iw = p->ifmap->shape.w; + *kh = p->weight->shape.h; + *kw = p->weight->shape.w; + *pad_right = p->pad_right; + *pad_left = p->pad_left; +} + +static int test_depthwise_pooling(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx) { + int loop = 1; + int test_finished_num = 0; + int ihs[] = {24, 96, 120, 480, 0}; + int iws[] = {16, 17, 19, 23, 128, 256, 0}; + int stride_hs[] = {3, 4, 0}; + cvk_fmt_t formats[] = {CVK_FMT_I8, CVK_FMT_U8, CVK_FMT_BF16, CVK_FMT_F32}; + int ic, ih, iw, kh, kw, pad_right, pad_left; + cvk_fmt_t ifmt; + param_t param; + assert(print_pooling_param); + + ifmt = CVK_FMT_U8; + param = random_depthwise_param(bk_ctx, 210, 640, 1, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + print_pooling_param(¶m); + free_depthwise_struct(¶m); + +#if 1 + param = random_depthwise_param(bk_ctx, 36, 11, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + print_pooling_param(¶m); + free_depthwise_struct(¶m); + + ifmt = CVK_FMT_U8; + param = random_depthwise_param(bk_ctx, 24, 29, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, 
&pad_left); + free_depthwise_struct(¶m); + test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + + ifmt = CVK_FMT_BF16; + param = random_depthwise_param(bk_ctx, 480, 53, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + free_depthwise_struct(¶m); + test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + + ifmt = CVK_FMT_I8; + param = random_depthwise_param(bk_ctx, 480, 61, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + free_depthwise_struct(¶m); + test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + + ifmt = CVK_FMT_U8; + param = random_depthwise_param(bk_ctx, 24, 17, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + free_depthwise_struct(¶m); + test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + + ifmt = CVK_FMT_BF16; + param = random_depthwise_param(bk_ctx, 48, 65, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + free_depthwise_struct(¶m); + test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + + ifmt = CVK_FMT_I8; + param = random_depthwise_param(bk_ctx, 48, 63, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + free_depthwise_struct(¶m); + test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); +#endif + + for (int i = 0; i < loop; i++) { + for (int i = 0; ihs[i] != 0; i++) { + for (int j = 0; iws[j] != 0; j++) { + for (int k = 0; stride_hs[k] != 0; k++) { + for (int l = 0; 
formats[l] != 0; l++) { + continue; + if (ihs[i] >= 480 && formats[l] == CVK_FMT_BF16) { + continue; + } + param = random_depthwise_param(bk_ctx, ihs[i], iws[j], stride_hs[k], formats[l]); + ifmt = formats[l]; + printf("test[%d] ih/iw/sh/fmt is {%d, %d, %d, %d}\n", test_finished_num, ihs[i], iws[j], + stride_hs[k], formats[l]); + + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + free_depthwise_struct(¶m); + int r = test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + test_finished_num += r; + } + } + } + } + } + printf("Test finished %u\n", test_finished_num); + + return test_finished_num; +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bk_ctx; + + test_init(&ctx, &bk_ctx); + + int round_mode; + round_mode = set_store_feround(); + int ret = test_depthwise_pooling(&ctx, bk_ctx); + assert(ret >= 0); + (void)ret; + printf("pass\n"); + + test_exit(&ctx, bk_ctx); + restore_feround(round_mode); + return 0; +} diff --git a/cvimath/tests/cvi1835/fp32_bf16.cpp b/cvimath/tests/cvi1835/fp32_bf16.cpp new file mode 100644 index 000000000..477d8402b --- /dev/null +++ b/cvimath/tests/cvi1835/fp32_bf16.cpp @@ -0,0 +1,127 @@ +#include +#include +#include + +typedef cvk_tdma_g2g_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) { + fprintf(f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", tag, p->src->shape.n, p->src->shape.c, + p->src->shape.h, p->src->shape.w, p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, + p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tg_shape_t src_shape; + cvk_tg_shape_t dst_shape; +} case_t; + +static cvk_fmt_type input_fmt[] = { + {CVK_FMT_BF16, CVK_FMT_BF16}, +}; + +static case_t g_cases[] = { + { + {1, 3, 3, 2}, + {1, 3, 3, 2}, + }, + { + {4, 3, 3, 2}, + {4, 3, 3, 2}, + }, + + //{ + // // YOLOv2 concat layer + // {1, 256, 19, 19}, + // {1, 256, 19, 
19}, + //}, + { + {1, 256, 19, 20}, + {1, 256, 19, 20}, + }, + { + {1, 1280, 3, 4}, + {1, 1280, 3, 4}, + }, + { + {1, 159 * 89, 36, 4}, + {1, 159 * 89, 36, 4}, + }, + { + {159, 89, 36, 4}, + {159, 89, 36, 4}, + }, +}; + +static void test_param_g2g(CVI_RT_HANDLE *ctx, cvk_context_t *bmk, param_t *p) { + print_param(stderr, p); + + // 2 means source is fp32, occupy 2 * bf16 size + uint64_t size = p->src->shape.n * p->src->shape.c * p->src->shape.h * p->src->shape.w / 2; + uint32_t *src_data = new uint32_t[size]; + for (uint64_t i = 0; i < size; i++) { + src_data[i] = ((0x1234 + i) << 16) + 0x5678 + i; + // printf("src[%lu] 0x%x\n", i, src_data[i]); + } + + test_put_tg_mem_comp(ctx, p->src, (uint8_t *)src_data); + + cvm_s2s_fp32_bf16(bmk, p->src->start_address, p->src->shape, p->dst->start_address, p->dst->shape, + CVK_FMT_BF16); + + long elapsed; + struct timeval t0, t1; + gettimeofday(&t0, NULL); + + test_submit_comp(ctx, bmk); + + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + printf("kernel takes %ld us\n", elapsed); + + uint16_t *dst_data = (uint16_t *)test_get_tg_mem_comp(ctx, p->dst); + + for (uint64_t i = 0; i < size; i++) { + uint16_t _src_data = (src_data[i] >> 16) & 0xffff; + if (dst_data[i] != _src_data) { + fprintf(stderr, "comparing failed at dst[%lu], got %x, exp %x\n", i, dst_data[i], _src_data); + exit(-1); + } + } + + delete[] src_data; + free(dst_data); +} + +static void destroy_param_g2g(CVI_RT_HANDLE *ctx, param_t *p) { + test_free_tg_mem_comp(ctx, p->src); + test_free_tg_mem_comp(ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, cvk_context_t *bmk, case_t *c) { + uint32_t nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (uint32_t i = 0; i < nr_fmt; i++) { + param_t p; + cvk_tg_t *src, *dst; + src = test_alloc_tg_mem_comp(ctx, bmk, c->src_shape, input_fmt[i].src_fmt); + dst = test_alloc_tg_mem_comp(ctx, bmk, c->dst_shape, input_fmt[i].dst_fmt); + p.src = src; + p.dst = 
dst; + test_param_g2g(ctx, bmk, &p); + destroy_param_g2g(ctx, &p); + } +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bmk; + + test_init(&ctx, &bmk); + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx, bmk); + return 0; +} diff --git a/cvimath/tests/cvi1835/gemm.cpp b/cvimath/tests/cvi1835/gemm.cpp new file mode 100644 index 000000000..82254b171 --- /dev/null +++ b/cvimath/tests/cvi1835/gemm.cpp @@ -0,0 +1,845 @@ +#include +#include +#include +#include // clock + +typedef cvk_tiu_matrix_multiplication_param_t param_t; +int random_seed; + +static uint64_t matrix_size(const cvk_ml_t *ml) { + uint64_t row = ml->shape.n; + uint64_t col = ml->shape.col; + return row * col; +} + +static uint64_t res_size(param_t *p) { return matrix_size(p->res); } + +static uint16_t *alloc_left(param_t *p) { + uint64_t size = matrix_size(p->left); + uint16_t *buf = new uint16_t[size]; + for (uint64_t i = 0; i < size; i++) { + buf[i] = convert_fp32_bf16(i); + } + + return buf; +} + +static uint16_t *alloc_right(param_t *p) { + uint64_t size = matrix_size(p->right); + uint16_t *buf = new uint16_t[size]; + for (uint64_t i = 0; i < size; i++) { + float val = 0.01; + buf[i] = convert_fp32_bf16(i); + val += 0.01; + } + return buf; +} + +static uint32_t *alloc_bias(param_t *p) { + if (!p->bias) return NULL; + + uint64_t size = matrix_size(p->bias); + uint32_t *buf = new uint32_t[size]; + for (uint64_t i = 0; i < size; i++) { + buf[i] = convert_fp32_hex(i); + } + return buf; +} + +static uint32_t *alloc_res(param_t *p) { + uint64_t size = res_size(p); + uint32_t *buf = new uint32_t[size]; + for (uint64_t i = 0; i < size; i++) { + buf[i] = convert_fp32_bf16(i); + } + return buf; +} + +static inline void cvm_relu(float *buf, uint64_t size) { + for (uint64_t i = 0; i < size; i++) + if (buf[i] < 0) buf[i] = 0; +} + +static void matrix_mac_ref(param_t *p, uint16_t left[], uint16_t 
right[], uint32_t bias[], + uint32_t res[]) { + uint64_t size = res_size(p); + uint32_t left_col = p->left->shape.col; + uint32_t right_col = p->right->shape.col; + uint32_t res_row = p->left->shape.n; + uint32_t res_col = p->res->shape.col; + uint32_t left_c = p->left->shape.c; + uint32_t left_w = p->left->shape.w; + + float *tmp_res = new float[size]; + if (p->add_result) { + for (uint32_t i = 0; i < res_row * res_col; i++) tmp_res[i] = convert_bf16_fp32(res[i]); + } else { + for (uint32_t i = 0; i < res_row * res_col; i++) tmp_res[i] = 0; + } + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + for (uint32_t wi = 0; wi < left_w; wi++) { + for (uint32_t ci = 0; ci < left_c; ci++) { + if ((wi + (ci * left_w)) >= left_col) continue; + uint32_t li = row * left_col + left_w * ci + wi; + uint32_t ri = (ci * left_w + wi) * right_col + col; + + float l = convert_bf16_fp32(left[li]); + float r = convert_bf16_fp32(right[ri]); + tmp_res[row * res_col + col] += l * r; + } + } + } + } + + if (p->bias) { + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + float b = convert_hex_fp32(bias[col]); + tmp_res[row * res_col + col] += b; + } + } + } + + if (p->relu_enable) cvm_relu(tmp_res, size); + + for (uint64_t i = 0; i < size; i++) { + res[i] = convert_fp32_bf16(tmp_res[i]); + } + delete[] tmp_res; +} + +static void put_bias(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx, const cvk_ml_t *ml, + uint32_t data[]) { + uint64_t size = ml->shape.col; + + uint16_t *tmp = new uint16_t[size * 2]; + for (uint64_t i = 0; i < size; i++) { + tmp[i] = (data[i] >> 16) & 0xFFFF; + tmp[i + size] = (data[i] & 0xFFFF); + } + + test_put_matrix_g2l_comp(ctx, bk_ctx, ml, (uint8_t *)tmp); + + delete[] tmp; +} + +static void put_res(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx, const cvk_ml_t *ml, + uint32_t data[]) { + uint64_t size = ml->shape.n * ml->shape.col; + + uint16_t *tmp = new uint16_t[size]; + for 
(uint64_t i = 0; i < size; i++) { + tmp[i] = (data[i] & 0xFFFF); + } + + test_put_matrix_g2l_comp(ctx, bk_ctx, ml, (uint8_t *)tmp); + + delete[] tmp; +} + +static uint32_t *get_res(CVI_RT_HANDLE *ctx, cvk_mg_t *mg, param_t *p) { + uint64_t size = res_size(p); + uint32_t *res = new uint32_t[size]; + + uint16_t *tmp = (uint16_t *)test_get_mg_mem_comp(ctx, mg); + for (uint64_t i = 0; i < size; i++) res[i] = tmp[i]; + + delete[] tmp; + return res; +} + +static inline cvk_mg_t *put_bf16_matrix_g(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx, + const cvk_ml_t *ml, uint8_t data[], + cvk_fmt_t mg_data_format) { + cvk_mg_shape_t s; + s.row = ml->shape.n; + s.col = ml->shape.col; + cvk_mg_t *mg = test_alloc_mg_mem_comp(ctx, s, mg_data_format); + + test_put_mg_mem_comp(ctx, mg, data); + test_submit_comp(ctx, bk_ctx); + + return mg; +} + +static void test_param(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx, param_t *p) { + uint16_t *left = alloc_left(p); + uint16_t *right = alloc_right(p); + uint32_t *bias = alloc_bias(p); + uint32_t *ref = alloc_res(p); + + cvk_mg_t *left_mg = put_bf16_matrix_g(ctx, bk_ctx, p->left, (uint8_t *)left, CVK_FMT_BF16); + cvk_mg_t *right_mg = put_bf16_matrix_g(ctx, bk_ctx, p->right, (uint8_t *)right, CVK_FMT_BF16); + cvk_mg_shape_t s; + s.row = p->res->shape.n; + s.col = p->res->shape.col; + cvk_mg_t *result_mg = test_alloc_mg_mem_comp(ctx, s, CVK_FMT_BF16); + + if (bias) put_bias(ctx, bk_ctx, p->bias, bias); + if (p->add_result) put_res(ctx, bk_ctx, p->res, ref); + + printf("start\n"); + size_t *slice_num = + cvm_gemm(bk_ctx, left_mg->start_address, right_mg->start_address, result_mg->start_address, + p->left->shape.n, p->left->shape.col, p->res->shape.col, CVK_FMT_BF16); + free(slice_num); // no need use in bf16 + test_submit_comp(ctx, bk_ctx); + + uint32_t *res = get_res(ctx, result_mg, p); + matrix_mac_ref(p, left, right, bias, ref); + + uint64_t size = res_size(p); + for (uint64_t i = 0; i < size; i++) { + if (res[i] != ref[i]) { + uint16_t _res = 
res[i] & 0xffff; + uint16_t _ref = ref[i] & 0xffff; + fprintf(stderr, "comparing failed at out[%lu], got %f(0x%x), exp %f(0x%x)\n", i, + convert_bf16_fp32(_res), res[i], convert_bf16_fp32(_ref), ref[i]); + fprintf(stderr, "random_seed=%d\n", random_seed); + exit(-1); + } + } + + test_free_mg_mem_comp(ctx, left_mg); + test_free_mg_mem_comp(ctx, right_mg); + test_free_mg_mem_comp(ctx, result_mg); + + delete[] left; + delete[] right; + delete[] bias; + delete[] res; +} + +static void destroy_param(cvk_context_t *bk_ctx, param_t *p) { + if (p->bias) bk_ctx->ops->lmem_free_matrix(bk_ctx, p->bias); + if (p->res) bk_ctx->ops->lmem_free_matrix(bk_ctx, p->res); + if (p->right) bk_ctx->ops->lmem_free_matrix(bk_ctx, p->right); + if (p->left) bk_ctx->ops->lmem_free_matrix(bk_ctx, p->left); +} + +static cvk_ml_t *alloc_param_res(cvk_context_t *bk_ctx, param_t *p) { + cvk_ml_shape_t s; + + s.n = p->left->shape.n; + s.c = p->right->shape.c; + s.w = p->right->shape.w; + s.col = p->right->shape.col; + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_ml_shape_t fake; + fake.n = 1; + fake.c = 1; + fake.w = 1; + fake.col = 1; + cvk_ml_t *t = bk_ctx->ops->lmem_alloc_matrix(bk_ctx, fake, fmt, 1); + t->shape = s; + return t; +} + +static param_t param_0(cvk_context_t *bk_ctx) { +retry: + random_seed = clock(); + srand(random_seed); + + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = rand() % 2; + p.relu_enable = 0; + p.add_result = 0; /*bf16 HW does not support add_result*/ + p.ps32_mode = 0; + + uint32_t left_row = rand() % 100 + 1; + uint32_t left_col = rand() % 100 + 1; + left_row = 1024; + left_col = 1024; + uint32_t left_w = rand() % (left_col / 5 + 1) + 1; // c is generate by w, and make c is larger + uint32_t left_c = left_col / left_w + (left_col % left_w ? 
1 : 0); + + uint32_t right_row = left_col; + uint32_t right_col = rand() % 100 + 1; + right_col = 1024; + uint32_t right_w = (rand() % (right_col / 5 + 1) + 1); // make c is larger + uint32_t right_c = right_col / right_w + (right_col % right_w ? 1 : 0); + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + uint32_t bias = rand() % 2; + bias = 0; + p.bias = NULL; + + cvk_ml_shape_t fake; + fake.n = 1; + fake.c = 1; + fake.w = 1; + fake.col = 1; + + cvk_ml_t *t = bk_ctx->ops->lmem_alloc_matrix(bk_ctx, fake, CVK_FMT_BF16, 1); + t->shape = left_shape; + p.left = t; + + t = bk_ctx->ops->lmem_alloc_matrix(bk_ctx, fake, CVK_FMT_BF16, 1); + t->shape = right_shape; + p.right = t; + if (!p.left || !p.right) { + printf("retry init_matrix_param\n"); + destroy_param(bk_ctx, &p); + goto retry; + } + + p.res = alloc_param_res(bk_ctx, &p); + if (bias) { + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + p.bias = bk_ctx->ops->lmem_alloc_matrix(bk_ctx, bias_shape, CVK_FMT_BF16, 1); + } + + if (!p.res || (bias && !p.bias)) { + printf("retry init_matrix_param\n"); + destroy_param(bk_ctx, &p); + goto retry; + } + + return p; +} + +// gemm test function +//#define USE_CBLAS_VERITY (1) + +#ifdef USE_CBLAS_VERITY +#include +#endif /* ifdef USE_CBLAS_VERITY */ + +// comes from +// https://stackoverflow.com/questions/47023651/multiplying-matrices-in-one-dimensional-arrays +void multiply(uint16_t *a, int row1, int col1, uint16_t *b, int row2, int col2, uint16_t *d) { + assert(col1 == row2); + // silence error=unused-but-set-parameter warning + (void)row2; + + for (int i = 0; i < row1; i++) { + for (int j = 0; j < col2; j++) { + float sum = 0; + for (int k = 0; k < col1; k++) { + float _a = convert_bf16_fp32(a[i * col1 + k]); + float _b = 
convert_bf16_fp32(b[k * col2 + j]); + sum = sum + _a * _b; + } + d[i * col2 + j] = convert_fp32_bf16(sum); + } + } + +#if 0 + for (int i = 0; i < size; i++) { + if (i % col2 == 0) { + printf("\n"); + } + printf("%f ", convert_bf16_fp32(d[i])); + } +#endif +} + +#ifdef USE_CBLAS_VERITY +#else +static void multiply_i32(uint8_t *a, int row1, int col1, uint8_t *b, int row2, int col2, + uint32_t *d, cvk_fmt_t fmt) { + assert(col1 == row2); + // silence error=unused-but-set-parameter warning + (void)row2; + + for (int i = 0; i < row1; i++) { + for (int j = 0; j < col2; j++) { + int sum = 0; + for (int k = 0; k < col1; k++) { + int _a = fmt == CVK_FMT_I8 ? (int8_t)(a[i * col1 + k]) : (a[i * col1 + k]); + int _b = fmt == CVK_FMT_I8 ? (int8_t)(b[k * col2 + j]) : (b[k * col2 + j]); + // printf("sum = sum + _a * _b = %d = %d + %d * %d\n", sum + _a * _b, sum, _a, _b); + sum = sum + _a * _b; + } + // printf("out [%d] is %d\n", i * col2 + j, sum); + d[i * col2 + j] = (sum); + } + } + +#if 0 + for (int i = 0; i < size; i++) { + if (i % col2 == 0) { + printf("\n"); + } + printf("%f ", convert_bf16_fp32(d[i])); + } +#endif +} +#endif /* ifdef USE_CBLAS_VERITY */ + +int array_cmp_int16(const char *const info, const uint16_t *p_exp, const uint16_t *p_got, + int count) { + int idx; + for (idx = 0; idx < count; idx++) { + if (p_exp[idx] != p_got[idx]) { + printf("%s error at index %d exp %d(%f,0x%x) got %d(%f,0x%x)\n", info, idx, p_exp[idx], + convert_bf16_fp32(p_exp[idx]), p_exp[idx], p_got[idx], convert_bf16_fp32(p_got[idx]), + p_got[idx]); + return -1; + } + } + return 0; +} + +int array_cmp_int32(const char *const info, const uint32_t *p_exp, const uint32_t *p_got, + int count) { + int idx; + for (idx = 0; idx < count; idx++) { + if (p_exp[idx] != p_got[idx]) { + printf("%s error at index %d exp %d got %d\n", info, idx, p_exp[idx], p_got[idx]); + return -1; + } + } + return 0; +} + +static void assign_bf16_values_to_matrix(uint16_t *matrix, size_t size) { + float t; + for (size_t i 
= 0; i < size; i++) { + float f; +#if 1 + if (i % 2 == 0) t = i % 8; + if (i % 2 == 1) t = -1 * (i % 8); + f = t; +#else + t = i * (i % 2 ? -1 : 1); + f = t * 0.01 + size * 0.01; +#endif + matrix[i] = convert_fp32_bf16(f); + // printf("f[%lu] is %f(0x%x)\n", i, f, matrix[i]); + } +} + +static void uint16_to_float(float *float_data, uint16_t *bf16_data, size_t size) { + for (size_t i = 0; i < size; i++) { + float_data[i] = convert_bf16_fp32(bf16_data[i]); + } +} + +static void uint8_to_float(float *float_data, uint8_t *i8_data, size_t size, cvk_fmt_t fmt) { + for (size_t i = 0; i < size; i++) { + int input = (i8_data[i]); + if (fmt == CVK_FMT_I8) { + input = (int8_t)(i8_data[i]); + } + float_data[i] = (float)input; + } +} + +static void assign_i8_values_to_matrix(uint8_t *matrix, size_t size) { + for (size_t i = 0; i < size; i++) { + matrix[i] = i + 20; + } +} + +#ifdef USE_CBLAS_VERITY +static void float_to_int16(uint16_t *int16_data, float *float_data, size_t size) { + for (size_t i = 0; i < size; i++) { + int16_data[i] = convert_fp32_bf16(float_data[i]); + } +} + +static void float_to_int32(uint32_t *int32_data, float *float_data, size_t size) { + for (size_t i = 0; i < size; i++) { + int32_data[i] = (uint32_t)float_data[i]; + } +} +#endif + +// int8 +static int _test_bmblas_gemm_bm1880v2(size_t M, size_t N, size_t K, cvk_fmt_t fmt) { + long elapsed; + struct timeval t0, t1; + int ret = 0; + + uint8_t *i8_A = new uint8_t[M * K]; + uint8_t *i8_B = new uint8_t[N * K]; + uint8_t *i8_C = new uint8_t[4 * M * N]; // 32 bit output + uint32_t *i32bit_ref = new uint32_t[M * N]; + + assign_i8_values_to_matrix(i8_A, M * K); + assign_i8_values_to_matrix(i8_B, N * K); + + float *float_A = new float[M * K]; + float *float_B = new float[N * K]; + float *float_C_ref = new float[M * N]; + uint8_to_float(float_A, i8_A, M * K, fmt); + uint8_to_float(float_B, i8_B, N * K, fmt); + +#if 0 + printf("\nA:"); + for (int i = 0; i < M; i++) { + printf("\n"); + for (int j = 0; j < K; j++) { 
+ printf("%e(0x%x) ", float_A[i * K + j], i8_A[i * K + j]); + } + } + printf("\nB:"); + for (int i = 0; i < K; i++) { + printf("\n"); + for (int j = 0; j < N; j++) { + printf("%e(0x%x) ", float_B[i * N + j], i8_B[i * N + j]); + } + } + printf("\nR:"); + for (int i = 0; i < M; i++) { + printf("\n"); + for (int j = 0; j < N; j++) { + printf("%e ", convert_i8_fp32(i32bit_ref[i * N + j])); + } + } +#endif + gettimeofday(&t0, NULL); + +#ifdef USE_CBLAS_VERITY + float alpha = 0; + float beta = 0; + + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha, float_A, K, float_B, N, + beta, float_C_ref, N); + float_to_int32(i32bit_ref, float_C_ref, M * N); +#else /* ! ifdef USE_CBLAS_VERITY */ + multiply_i32(i8_A, M, K, i8_B, K, N, i32bit_ref, fmt); +#endif /* ifdef USE_CBLAS_VERITY */ + + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; +#ifdef USE_CBLAS_VERITY + printf("cblas GEMM takes %ld us\n", elapsed); +#else /* ! ifdef USE_CBLAS_VERITY */ + printf("CPU GEMM takes %ld us\n", elapsed); +#endif /* ifdef USE_CBLAS_VERITY */ + + CVI_RT_HANDLE ctx; + cvk_context_t *bk_ctx; + + test_init(&ctx, &bk_ctx); + + // alloc device memory + cvk_mg_shape_t s_a = {(uint32_t)M, (uint32_t)K}; + cvk_mg_shape_t s_b = {(uint32_t)K, (uint32_t)N}; + cvk_mg_shape_t s_r = {2 * (uint32_t)M, 2 * (uint32_t)N}; + + size_t s_size_a = mg_shape_size(&s_a) * bytesize_of_fmt(fmt); + size_t s_size_b = mg_shape_size(&s_b) * bytesize_of_fmt(fmt); + size_t s_size_r = mg_shape_size(&s_r) * bytesize_of_fmt(fmt); + + CVI_RT_MEM devmem_a = CVI_RT_MemAlloc(ctx, s_size_a); + CVI_RT_MEM devmem_b = CVI_RT_MemAlloc(ctx, s_size_b); + CVI_RT_MEM devmem_r = CVI_RT_MemAlloc(ctx, s_size_r); + + gaddr_t gaddr_a = CVI_RT_MemGetPAddr(devmem_a); + gaddr_t gaddr_b = CVI_RT_MemGetPAddr(devmem_b); + gaddr_t gaddr_r = CVI_RT_MemGetPAddr(devmem_r); + + // copy to device memory + CVI_RT_MemCopyS2D(ctx, devmem_a, (uint8_t *)i8_A); + CVI_RT_MemCopyS2D(ctx, devmem_b, 
(uint8_t *)i8_B); + CVI_RT_MemCopyS2D(ctx, devmem_r, (uint8_t *)i8_C); + + // do computation with bmkernel + // bmruntime_bmkernel_create(ctx, (void**)&bk_ctx); + + // printf("gaddr_a/gaddr_b/gaddr_r at %zx %zx %zx\n", gaddr_a, gaddr_b, gaddr_r); + size_t *slice_num = cvm_gemm((cvk_context_t *)bk_ctx, gaddr_a, gaddr_b, gaddr_r, M, K, N, fmt); + + gettimeofday(&t0, NULL); + test_submit_comp(&ctx, bk_ctx); + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + printf("TPU GEMM takes %ld us\n", elapsed); + + CVI_RT_MemCopyD2S(ctx, (uint8_t *)i8_C, devmem_r); + + CVI_RT_MemFree(ctx, devmem_a); + CVI_RT_MemFree(ctx, devmem_b); + CVI_RT_MemFree(ctx, devmem_r); + + test_exit(&ctx, bk_ctx); + + uint32_t *i32_C = new uint32_t[M * N]; // 32 bit output with stirded + + cvm_combin_gemm_i8(slice_num, i8_C, i32_C, M, N); + + free(slice_num); + + int cmp_res = array_cmp_int32("gemm", i32bit_ref, i32_C, M * N); + if (cmp_res != 0) { + ret = -1; + printf("Comparison failed for cblas_sgemm and bmblas_gemm!\n"); +#if 0 + printf("\nref/cmd is:"); + for (int i = 0; i < M; i++) { + printf(">\n"); + for (int j = 0; j < N; j++) { + printf("%f(0x%x)/%f(0x%x) ", + convert_i8_fp32(i32bit_ref[i * N + j]), i32bit_ref[i * N + j], + convert_i8_fp32(i8_C[i * N + j]), i8_C[i * N + j] + ); + } + } +#endif + } else { + // printf("Comparison done for cblas_sgemm and bmblas_gemm!\n\n"); + } + + delete[] float_A; + delete[] float_B; + delete[] float_C_ref; + delete[] i8_A; + delete[] i8_B; + delete[] i8_C; + delete[] i32bit_ref; + delete[] i32_C; + return ret; +} + +int test_bmblas_gemm_bm1880v2(size_t M, size_t N, size_t K, cvk_fmt_t fmt) { + printf("%s: M=%zu, N=%zu, K=%zu, fmt_sz: %d\n", __func__, M, N, K, cvm_bytesize_of_fmt(fmt)); + + // FIXME: not duplicate + if (fmt != CVK_FMT_BF16) { + return _test_bmblas_gemm_bm1880v2(M, N, K, fmt); + } + + long elapsed; + struct timeval t0, t1; + int ret = 0; + + uint16_t *bf16_A = new uint16_t[M * K]; + uint16_t 
*bf16_B = new uint16_t[N * K]; + uint16_t *bf16_C = new uint16_t[2 * M * N]; + uint16_t *int16_C_ref = new uint16_t[M * N]; + + assign_bf16_values_to_matrix(bf16_A, M * K); + assign_bf16_values_to_matrix(bf16_B, N * K); + + float *float_A = new float[M * K]; + float *float_B = new float[N * K]; + float *float_C_ref = new float[M * N]; + uint16_to_float(float_A, bf16_A, M * K); + uint16_to_float(float_B, bf16_B, N * K); + +#if 0 + printf("\nA:"); + for (int i = 0; i < M; i++) { + printf("\n"); + for (int j = 0; j < K; j++) { + printf("%e(0x%x) ", float_A[i * K + j], bf16_A[i * K + j]); + } + } + printf("\nB:"); + for (int i = 0; i < K; i++) { + printf("\n"); + for (int j = 0; j < N; j++) { + printf("%e(0x%x) ", float_B[i * N + j], bf16_B[i * N + j]); + } + } + printf("\nR:"); + for (int i = 0; i < M; i++) { + printf("\n"); + for (int j = 0; j < N; j++) { + printf("%e ", convert_bf16_fp32(int16_C_ref[i * N + j])); + } + } +#endif + gettimeofday(&t0, NULL); + +#ifdef USE_CBLAS_VERITY + float alpha = 0; + float beta = 0; + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha, float_A, K, float_B, N, + beta, float_C_ref, N); + float_to_int16(int16_C_ref, float_C_ref, M * N); +#else /* ! 
ifdef USE_CBLAS_VERITY */ + multiply(bf16_A, M, K, bf16_B, K, N, int16_C_ref); +#endif /* ifdef USE_CBLAS_VERITY */ + + delete[] float_A; + delete[] float_B; + delete[] float_C_ref; + + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; +#ifdef USE_CBLAS_VERITY + printf("cblas GEMM takes %ld us\n", elapsed); +#else + printf("CPU GEMM takes %ld us\n", elapsed); +#endif + + CVI_RT_HANDLE ctx; + cvk_context_t *bk_ctx; + + test_init(&ctx, &bk_ctx); + + // alloc device memory + cvk_mg_shape_t s_a = {(uint32_t)M, (uint32_t)K}; + cvk_mg_shape_t s_b = {(uint32_t)K, (uint32_t)N}; + cvk_mg_shape_t s_r = {(uint32_t)M, (uint32_t)N}; + + size_t s_size_a = mg_shape_size(&s_a) * bytesize_of_fmt(fmt); + size_t s_size_b = mg_shape_size(&s_b) * bytesize_of_fmt(fmt); + size_t s_size_r = mg_shape_size(&s_r) * bytesize_of_fmt(fmt) * bytesize_of_fmt(fmt); + + CVI_RT_MEM devmem_a = CVI_RT_MemAlloc(ctx, s_size_a); + CVI_RT_MEM devmem_b = CVI_RT_MemAlloc(ctx, s_size_b); + CVI_RT_MEM devmem_r = CVI_RT_MemAlloc(ctx, s_size_r); + + gaddr_t gaddr_a = CVI_RT_MemGetPAddr(devmem_a); + gaddr_t gaddr_b = CVI_RT_MemGetPAddr(devmem_b); + gaddr_t gaddr_r = CVI_RT_MemGetPAddr(devmem_r); + + // copy to device memory + CVI_RT_MemCopyS2D(ctx, devmem_a, (uint8_t *)bf16_A); + CVI_RT_MemCopyS2D(ctx, devmem_b, (uint8_t *)bf16_B); + CVI_RT_MemCopyS2D(ctx, devmem_r, (uint8_t *)bf16_C); + // do computation with bmkernel + // bmruntime_bmkernel_create(ctx, (void**)&bk_ctx); + + size_t *slice_num = + cvm_gemm((cvk_context_t *)bk_ctx, gaddr_a, gaddr_b, gaddr_r, M, K, N, CVK_FMT_BF16); + free(slice_num); // no use slice_num infomation in BF16 + + gettimeofday(&t0, NULL); + test_submit_comp(&ctx, bk_ctx); + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + printf("TPU GEMM takes %ld us\n", elapsed); + + CVI_RT_MemCopyD2S(ctx, (uint8_t *)bf16_C, devmem_r); + + // bmruntime_bmkernel_destroy(ctx); + + CVI_RT_MemFree(ctx, 
devmem_a); + CVI_RT_MemFree(ctx, devmem_b); + CVI_RT_MemFree(ctx, devmem_r); + + test_exit(&ctx, bk_ctx); + + int cmp_res = array_cmp_int16("gemm", int16_C_ref, bf16_C, M * N); + if (cmp_res != 0) { + ret = -1; + printf("Comparison failed for cblas_sgemm and bmblas_gemm!\n"); +#if 0 + printf("\nref/cmd is:"); + for (int i = 0; i < M; i++) { + printf(">\n"); + for (int j = 0; j < N; j++) { + printf("%f(0x%x)/%f(0x%x) ", + convert_bf16_fp32(int16_C_ref[i * N + j]), int16_C_ref[i * N + j], + convert_bf16_fp32(bf16_C[i * N + j]), bf16_C[i * N + j] + ); + } + } +#endif + } else { + // printf("Comparison done for cblas_sgemm and bmblas_gemm!\n\n"); + } + + delete[] bf16_A; + delete[] bf16_B; + delete[] bf16_C; + delete[] int16_C_ref; + return ret; +} + +#define test_one_param(n) \ + do { \ + param_t p = param_##n(bk_ctx); \ + test_param(&ctx, bk_ctx, &p); \ + destroy_param(bk_ctx, &p); \ + } while (0) + +int main() { + int round_mode; + round_mode = set_store_feround(); + CVI_RT_HANDLE ctx; + cvk_context_t *bk_ctx; + + test_init(&ctx, &bk_ctx); + + // int8 example + if (0 != test_bmblas_gemm_bm1880v2(1, 100, 512, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(2, 100, 512, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(4, 100, 512, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(8, 100, 512, CVK_FMT_I8)) exit(-1); + + if (0 != test_bmblas_gemm_bm1880v2(1, 20000, 512, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(10, 200, 10, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(1, 200, 500, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(1, 20, 50, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(2, 10, 100, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(2, 1000, 5, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(20, 5, 5, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(2, 5, 5, CVK_FMT_I8)) exit(-1); + cvk_fmt_t fmts[2] = {CVK_FMT_BF16, CVK_FMT_I8}; + // 
cvk_fmt_t fmts[1] = {CVK_FMT_BF16}; + int fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (int i = 0; i < fmts_sz; i++) { + cvk_fmt_t fmt = fmts[i]; + if (0) { + // backend implement + for (int i = 0; i < 30; i++) test_one_param(0); + + } else { + // gemm, plz refer bmtap2/libbmblas + int M = 10000; + int N = 10000; + int K = 1024; + M = 2000; + N = 2000; + int m, k, n; + + if (0) { + for (m = 1; m <= M; m *= 10) { + for (n = 1; n <= N; n += 200) { + for (k = 1; k <= K; k *= 2) { + if (0 != test_bmblas_gemm_bm1880v2(m, n, k, fmt)) { + exit(-1); + } + } + } + } + } + + if (1) { + if (0 != test_bmblas_gemm_bm1880v2(1, 500, 512, fmt)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(1, 750, 512, fmt)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(1, 100, 512, fmt)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(2, 100, 512, fmt)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(4, 100, 512, fmt)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(8, 100, 512, fmt)) exit(-1); + // if (0 != test_bmblas_gemm_bm1880v2(1, 50000, 512, fmt)) exit(-1); + // if (0 != test_bmblas_gemm_bm1880v2(1, 75000, 512, fmt)) exit(-1); + // if (0 != test_bmblas_gemm_bm1880v2(1, 10000, 512, fmt)) exit(-1); + // if (0 != test_bmblas_gemm_bm1880v2(2, 10000, 512, fmt)) exit(-1); + // if (0 != test_bmblas_gemm_bm1880v2(4, 10000, 512, fmt)) exit(-1); + // if (0 != test_bmblas_gemm_bm1880v2(8, 10000, 512, fmt)) exit(-1); + } + + printf("Comparison done for cblas_sgemm and bmblas_gemm!\n\n"); + } + } + + test_exit(&ctx, bk_ctx); + restore_feround(round_mode); + return 0; +} diff --git a/cvimath/tests/cvi1835/mask.cpp b/cvimath/tests/cvi1835/mask.cpp new file mode 100644 index 000000000..98211ef11 --- /dev/null +++ b/cvimath/tests/cvi1835/mask.cpp @@ -0,0 +1,158 @@ +#include +#include + +#define OUT +#define IN +#include +#include +#include +#include +#include +#include +//#define DBG + +using namespace std; + +/** + * pre_data means we test fixed pattern, it should be same sa lut + */ +// enum 
TEST_MODE { +// CVM_MASK_TYPE_GT_0 = 0, // remain > 0 +// //CVM_MASK_TYPE_GE_0, // remain >= 0 +// //CVM_MASK_TYPE_EQ_0, // remain = 0 +// //CVM_MASK_TYPE_LT_0, // remain < 0 +// //CVM_MASK_TYPE_LE_0, // remain <= 0 +// CVM_MASK_MAX +//}; + +enum CVM_MASK_TYPE mode; + +struct pattern { + float *input; + float *ref; + int len; +}; +#define SIZEOF(x) (sizeof(x) / sizeof(x[0])) +float cvm_mask_type_gt_0_input[] = {-1 * pow(2, -62), -0.003, -1.0, -100000, 0.000001, 1, 1000, + pow(2, 62), 0}; + +float cvm_mask_type_gt_0_output[] = {0, 0, 0, 0, 1, 1, 1, 1, 0}; +float cvm_mask_type_ge_0_output[] = {0, 0, 0, 0, 1, 1, 1, 1, 1}; +float cvm_mask_type_eq_0_output[] = {0, 0, 0, 0, 0, 0, 0, 0, 1}; +float cvm_mask_type_lt_0_output[] = {1, 1, 1, 1, 0, 0, 0, 0, 0}; +float cvm_mask_type_le_0_output[] = {1, 1, 1, 1, 0, 0, 0, 0, 1}; + +int input_sz = sizeof(cvm_mask_type_gt_0_input) / sizeof(cvm_mask_type_gt_0_input[0]); + +static struct pattern patterns[] = { + {cvm_mask_type_gt_0_input, cvm_mask_type_gt_0_output, input_sz}, + {cvm_mask_type_gt_0_input, cvm_mask_type_ge_0_output, input_sz}, + {cvm_mask_type_gt_0_input, cvm_mask_type_eq_0_output, input_sz}, + {cvm_mask_type_gt_0_input, cvm_mask_type_lt_0_output, input_sz}, + {cvm_mask_type_gt_0_input, cvm_mask_type_le_0_output, input_sz}, +}; + +static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) { + cvk_fmt_t fmt = CVK_FMT_BF16; + struct pattern *p = &patterns[mode]; + uint32_t input_n = 1; + uint32_t input_c = 1; + uint32_t input_h = 1; + uint32_t input_w = p->len; + + cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + cvk_tl_shape_t ofmap_shape = ifmap_shape; + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + + int data_type_size = bytesize_of_fmt(fmt); + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + + cvk_tl_shape_t table_shape; + uint64_t table_bytesize = 
cvm_lut_tbl_bytesize(bmk, &table_shape, fmt); + + cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *out = tl_ofmap_bf16; + cvk_tl_t *tl_pos_neg_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_0_idx_table = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + + // temp buf + cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf2 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf4 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1); + + uint16_t *input_data = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize); + uint16_t *table_data_atan_pos_neg = (uint16_t *)xmalloc(table_bytesize); + uint16_t *idx_0_table_data = (uint16_t *)xmalloc(table_bytesize); + + cvm_gen_0_tbl(idx_0_table_data, &table_shape); + cvm_pos_neg_tbl(table_data_atan_pos_neg, &table_shape); + + for (uint32_t i = 0; i < ifmap_size; i++) { + input_data[i] = convert_fp32_bf16(p->input[i]); + ref_data[i] = convert_fp32_bf16(p->ref[i]); + } + + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)input_data); + test_put_tensor_g2l_comp(ctx, bmk, tl_pos_neg_buf, (uint8_t *)table_data_atan_pos_neg); + test_put_tensor_g2l_comp(ctx, bmk, tl_0_idx_table, (uint8_t *)idx_0_table_data); + + cvm_emit_mask(bmk, tl_ifmap, tl_buf, tl_buf2, tl_buf4, tl_pos_neg_buf, tl_0_idx_table, out, fmt, + mode); + + test_submit_comp(ctx, bmk); + + uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, out); + + for (uint32_t i = 0; i < ifmap_size; i++) { + if (ref_data[i] != ofmap_data[i]) { + fprintf(stderr, + "comparing failed at mode %d ofmap_data[%u] got %f(0x%x), ref " + "%f(0x%x)\n", + mode, i, convert_bf16_fp32(ofmap_data[i]), ofmap_data[i], + convert_bf16_fp32(ref_data[i]), ref_data[i]); + exit(-1); + } + } +#if 0 + if (!is_close) { + float input = 
convert_bf16_fp32(ifmap[i]); + } +#endif + free_tl(bmk, tl_buf4); + free_tl(bmk, tl_buf2); + free_tl(bmk, tl_buf); + free_tl(bmk, tl_0_idx_table); + free_tl(bmk, tl_pos_neg_buf); + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_ifmap); + + free(input_data); + free(ref_data); + free(ofmap_data); + free(table_data_atan_pos_neg); + free(idx_0_table_data); +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bmk; + int round_mode; + + round_mode = set_store_feround(); + + test_init(&ctx, &bmk); + + for (int i = CVM_MASK_TYPE_GT_0; i < CVM_MASK_MAX; i++) { + mode = static_cast(i); + printf("test mode %d...\n", mode); + testbench(&ctx, bmk); + } + + test_exit(&ctx, bmk); + restore_feround(round_mode); + return 0; +} diff --git a/cvimath/tests/cvi1835/reciprocal.cpp b/cvimath/tests/cvi1835/reciprocal.cpp new file mode 100644 index 000000000..586018dea --- /dev/null +++ b/cvimath/tests/cvi1835/reciprocal.cpp @@ -0,0 +1,376 @@ +/** + */ +#include +#include + +#include +#include +#include +#include +#include +#include +//#define DBG + +using namespace std; + +/** + * pre_data means we test fixed pattern, it should be same sa lut + */ +enum TEST_MODE { + PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare + GEN_POW_20_DATA_MAX_ERROR, // generate 2^-20 ~ 2^20 value that check epsilon + TEST_MODE_MAX, +}; + +static TEST_MODE mode; + +static uint16_t test_pattern[] = { + 0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52, 0x3A6C, 0x3A83, 0x3A90, + 0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF, 0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17, + 0x3B1D, 0x3B24, 0x3B2A, 0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65, + 0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90, 0x3B93, 0x3B97, 0x3B9A, + 0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE, 0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1, + 0x3BC5, 0x3BC8, 0x3BCB, 0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9, + 0x3BEC, 0x3BEF, 0x3BF2, 
0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03, 0x3C05, 0x3C06, 0x3C08, + 0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12, 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C, + 0x3C1D, 0x3C1F, 0x3C21, 0x3C22, 0x3C24, 0x3C25, 0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F, + 0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E, 0x3C40, 0x3C41, 0x3C43, + 0x3C45, 0x3C46, 0x3C48, 0x3C4A, 0x3C4B, 0x3C4D, 0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57, + 0x3C58, 0x3C5A, 0x3C5C, 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A, + 0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, 0x3C7B, 0x3C7C, 0x3C7E, + 0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89, + 0x3C8A, 0x3C8A, 0x3C8B, 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93, + 0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, 0x3C9B, 0x3C9C, 0x3C9C, + 0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6, + 0x3CA7, 0x3CA8, 0x3CA9, 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0, + 0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, 0x3CB8, 0x3CB9, 0x3CBA, + 0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4, + 0x3CC5, 0x3CC5, 0x3CC6, 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE, + 0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, 0x3CD6, 0x3CD7, 0x3CD7, + 0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1, + 0x3CE2, 0x3CE3, 0x3CE4, 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB, + 0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, 0x3CF3, 0x3CF4, 0x3CF5, + 0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF, + 0x3D00, 0x3D00, 0x3D01, 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 
0x3D03, 0x3D03, 0x3D04, 0x3D04, + 0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, 0x3D08, 0x3D09, 0x3D09, + 0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 0x3D0E, + 0x3D0F, 0x3D0F, 0x3D0F, 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13, + 0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 0x3D16, 0x3D17, 0x3D17, 0x3D18, 0x3D18, + 0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D, + 0x3D1D, 0x3D1E, 0x3D1E, 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22, + 0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, 0x3D26, 0x3D26, 0x3D27, + 0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C, + 0x3D2C, 0x3D2C, 0x3D2D, 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31, + 0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 0x3D33, 0x3D33, 0x3D34, 0x3D34, 0x3D35, 0x3D35, 0x3D35, + 0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A, + 0x3D3B, 0x3D3B, 0x3D3C, 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F, + 0x3D40, 0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, 0x3D43, 0x3D44, 0x3D44, + 0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49, + 0x3D4A, 0x3D4A, 0x3D4A, 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E, + 0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, 0x3D52, 0x3D53, 0x3D53, + 0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58, + 0x3D58, 0x3D59, 0x3D59, 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D, + 0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, 0x3D61, 0x3D61, 0x3D62, + 0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67, + 
0x3D67, 0x3D67, 0x3D68, 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C, + 0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, 0x3D70, 0x3D70, 0x3D70, + 0x3D71, 0x3D71, 0x3D72, 0x3D72, 0x3D72, 0x3D73, 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75, + 0x3D76, 0x3D76, 0x3D77, 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A, + 0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, 0x3D7E, 0x3D7F, 0x3D7F, + 0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82, + 0x3D82, 0x3D82, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85, + 0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D87, 0x3D87, 0x3D87, + 0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89, + 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8C, 0x3D8C, + 0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, + 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91, + 0x3D91, 0x3D91, 0x3D91, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93, + 0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, 0x3D95, 0x3D96, 0x3D96, + 0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98, + 0x3D98, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B, + 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9D, 0x3D9D, 0x3D9D, + 0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0, + 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2, + 0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, + 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 
0x3DA6, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7, + 0x3DA7, 0x3DA7, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, + 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAC, 0x3DAC, + 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, + 0x3DAE, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1, + 0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3, + 0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6, + 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8, + 0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBB, + 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD, + 0x3DBD, 0x3DBD, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, + 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC2, 0x3DC2, 0x3DC2, + 0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, + 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7, + 0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, 0x3DC9, 0x3DC9, 0x3DC9, + 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC, + 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5, + 0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, 0x3DDF, 0x3DE0, 0x3DE1, + 0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5, +}; + +static uint16_t test_pattern_ref[] = { + 0x7f7f, 0x461c, 0x459c, 0x4551, 0x451c, 0x44fa, 0x44d1, 0x44b2, 0x449c, 0x448b, 0x447a, 0x4464, + 0x4451, 0x4441, 0x4432, 0x4426, 0x441c, 0x4413, 0x440b, 0x4404, 0x43fa, 0x43ed, 0x43e4, 0x43d9, + 
0x43d1, 0x43c8, 0x43c1, 0x43b9, 0x43b2, 0x43ac, 0x43a6, 0x43a1, 0x439c, 0x4398, 0x4393, 0x438f, + 0x438b, 0x4387, 0x4384, 0x4380, 0x437a, 0x4375, 0x436d, 0x4368, 0x4364, 0x435f, 0x4359, 0x4355, + 0x4351, 0x434c, 0x4348, 0x4344, 0x4341, 0x433c, 0x4339, 0x4336, 0x4332, 0x432f, 0x432c, 0x432a, + 0x4326, 0x4324, 0x4321, 0x431f, 0x431c, 0x431a, 0x4318, 0x4315, 0x4313, 0x4311, 0x430f, 0x430d, + 0x430b, 0x4309, 0x4307, 0x4305, 0x4304, 0x4302, 0x4300, 0x42fe, 0x42fa, 0x42f6, 0x42f5, 0x42f1, + 0x42ed, 0x42ec, 0x42e8, 0x42e5, 0x42e4, 0x42e0, 0x42df, 0x42dc, 0x42d9, 0x42d8, 0x42d5, 0x42d2, + 0x42d1, 0x42ce, 0x42cc, 0x42ca, 0x42c8, 0x42c7, 0x42c4, 0x42c2, 0x42c1, 0x42bf, 0x42bc, 0x42bb, + 0x42b9, 0x42b7, 0x42b6, 0x42b4, 0x42b2, 0x42b1, 0x42af, 0x42ae, 0x42ac, 0x42ab, 0x42aa, 0x42a8, + 0x42a6, 0x42a5, 0x42a4, 0x42a2, 0x42a1, 0x42a0, 0x429f, 0x429e, 0x429c, 0x429b, 0x429a, 0x4298, + 0x4298, 0x4296, 0x4295, 0x4294, 0x4293, 0x4292, 0x4291, 0x4290, 0x428f, 0x428e, 0x428d, 0x428c, + 0x428b, 0x428a, 0x4289, 0x4288, 0x4287, 0x4286, 0x4285, 0x4285, 0x4284, 0x4283, 0x4282, 0x4281, + 0x4280, 0x427e, 0x427e, 0x427c, 0x427a, 0x4278, 0x4276, 0x4275, 0x4275, 0x4273, 0x4271, 0x426f, + 0x426d, 0x426d, 0x426c, 0x426a, 0x4268, 0x4267, 0x4265, 0x4265, 0x4264, 0x4262, 0x4260, 0x425f, + 0x425f, 0x425d, 0x425c, 0x425a, 0x4259, 0x4258, 0x4258, 0x4256, 0x4255, 0x4253, 0x4252, 0x4252, + 0x4251, 0x424f, 0x424e, 0x424d, 0x424c, 0x424c, 0x424a, 0x4249, 0x4248, 0x4247, 0x4247, 0x4245, + 0x4244, 0x4243, 0x4242, 0x4241, 0x4241, 0x4240, 0x423f, 0x423d, 0x423c, 0x423c, 0x423b, 0x423a, + 0x4239, 0x4238, 0x4237, 0x4237, 0x4236, 0x4235, 0x4234, 0x4233, 0x4232, 0x4232, 0x4231, 0x4230, + 0x422f, 0x422e, 0x422e, 0x422d, 0x422c, 0x422c, 0x422b, 0x422a, 0x422a, 0x4229, 0x4228, 0x4227, + 0x4226, 0x4226, 0x4225, 0x4225, 0x4224, 0x4223, 0x4222, 0x4222, 0x4221, 0x4221, 0x4220, 0x421f, + 0x421f, 0x421e, 0x421e, 0x421d, 0x421c, 0x421b, 0x421b, 0x421b, 0x421a, 0x4219, 0x4218, 0x4218, + 0x4218, 0x4217, 0x4216, 0x4216, 0x4215, 
0x4215, 0x4214, 0x4214, 0x4213, 0x4212, 0x4212, 0x4212, + 0x4211, 0x4210, 0x4210, 0x420f, 0x420f, 0x420e, 0x420e, 0x420d, 0x420d, 0x420d, 0x420c, 0x420b, + 0x420b, 0x420a, 0x420a, 0x420a, 0x4209, 0x4209, 0x4208, 0x4207, 0x4207, 0x4207, 0x4206, 0x4206, + 0x4205, 0x4205, 0x4205, 0x4204, 0x4204, 0x4203, 0x4203, 0x4203, 0x4202, 0x4202, 0x4201, 0x4201, + 0x4200, 0x4200, 0x41fe, 0x41fe, 0x41fe, 0x41fc, 0x41fc, 0x41fa, 0x41fa, 0x41fa, 0x41f8, 0x41f8, + 0x41f6, 0x41f6, 0x41f5, 0x41f5, 0x41f5, 0x41f3, 0x41f3, 0x41f1, 0x41f1, 0x41f1, 0x41ef, 0x41ef, + 0x41ed, 0x41ed, 0x41ed, 0x41ec, 0x41ec, 0x41ea, 0x41ea, 0x41ea, 0x41e8, 0x41e8, 0x41e7, 0x41e7, + 0x41e5, 0x41e5, 0x41e5, 0x41e4, 0x41e4, 0x41e2, 0x41e2, 0x41e2, 0x41e0, 0x41e0, 0x41df, 0x41df, + 0x41df, 0x41dd, 0x41dd, 0x41dc, 0x41dc, 0x41da, 0x41da, 0x41da, 0x41d9, 0x41d9, 0x41d8, 0x41d8, + 0x41d8, 0x41d6, 0x41d6, 0x41d5, 0x41d5, 0x41d5, 0x41d3, 0x41d3, 0x41d2, 0x41d2, 0x41d2, 0x41d1, + 0x41d1, 0x41cf, 0x41cf, 0x41ce, 0x41ce, 0x41ce, 0x41cd, 0x41cd, 0x41cc, 0x41cc, 0x41cc, 0x41ca, + 0x41ca, 0x41c9, 0x41c9, 0x41c9, 0x41c8, 0x41c8, 0x41c7, 0x41c7, 0x41c7, 0x41c5, 0x41c5, 0x41c4, + 0x41c4, 0x41c3, 0x41c3, 0x41c3, 0x41c2, 0x41c2, 0x41c1, 0x41c1, 0x41c1, 0x41c0, 0x41c0, 0x41bf, + 0x41bf, 0x41bf, 0x41bd, 0x41bd, 0x41bc, 0x41bc, 0x41bc, 0x41bb, 0x41bb, 0x41ba, 0x41ba, 0x41b9, + 0x41b9, 0x41b9, 0x41b8, 0x41b8, 0x41b7, 0x41b7, 0x41b7, 0x41b6, 0x41b6, 0x41b5, 0x41b5, 0x41b5, + 0x41b4, 0x41b4, 0x41b3, 0x41b3, 0x41b2, 0x41b2, 0x41b2, 0x41b1, 0x41b1, 0x41b0, 0x41b0, 0x41b0, + 0x41af, 0x41af, 0x41ae, 0x41ae, 0x41ae, 0x41ad, 0x41ad, 0x41ac, 0x41ac, 0x41ac, 0x41ac, 0x41ac, + 0x41ab, 0x41ab, 0x41aa, 0x41aa, 0x41aa, 0x41a9, 0x41a9, 0x41a8, 0x41a8, 0x41a8, 0x41a7, 0x41a7, + 0x41a6, 0x41a6, 0x41a6, 0x41a5, 0x41a5, 0x41a5, 0x41a5, 0x41a5, 0x41a4, 0x41a4, 0x41a3, 0x41a3, + 0x41a2, 0x41a2, 0x41a2, 0x41a1, 0x41a1, 0x41a1, 0x41a1, 0x41a1, 0x41a0, 0x41a0, 0x419f, 0x419f, + 0x419f, 0x419e, 0x419e, 0x419e, 0x419e, 0x419e, 0x419d, 0x419d, 0x419c, 0x419c, 
0x419b, 0x419b, + 0x419b, 0x419b, 0x419b, 0x419a, 0x419a, 0x419a, 0x4199, 0x4199, 0x4198, 0x4198, 0x4198, 0x4198, + 0x4198, 0x4197, 0x4197, 0x4197, 0x4196, 0x4196, 0x4196, 0x4196, 0x4195, 0x4195, 0x4195, 0x4194, + 0x4194, 0x4194, 0x4194, 0x4194, 0x4193, 0x4193, 0x4192, 0x4192, 0x4192, 0x4192, 0x4192, 0x4191, + 0x4191, 0x4190, 0x4190, 0x4190, 0x4190, 0x4190, 0x418f, 0x418f, 0x418f, 0x418e, 0x418e, 0x418e, + 0x418e, 0x418e, 0x418d, 0x418d, 0x418d, 0x418d, 0x418d, 0x418c, 0x418c, 0x418b, 0x418b, 0x418b, + 0x418b, 0x418b, 0x418a, 0x418a, 0x418a, 0x418a, 0x418a, 0x4189, 0x4189, 0x4189, 0x4189, 0x4189, + 0x4188, 0x4188, 0x4187, 0x4187, 0x4187, 0x4187, 0x4187, 0x4186, 0x4186, 0x4186, 0x4186, 0x4186, + 0x4185, 0x4185, 0x4185, 0x4185, 0x4185, 0x4184, 0x4184, 0x4184, 0x4184, 0x4184, 0x4183, 0x4183, + 0x4183, 0x4183, 0x4183, 0x4182, 0x4182, 0x4182, 0x4182, 0x4181, 0x4181, 0x4181, 0x4181, 0x4181, + 0x4180, 0x4180, 0x4180, 0x4180, 0x417e, 0x417e, 0x417e, 0x417e, 0x417e, 0x417c, 0x417c, 0x417c, + 0x417c, 0x417c, 0x417a, 0x417a, 0x417a, 0x417a, 0x417a, 0x4178, 0x4178, 0x4178, 0x4178, 0x4176, + 0x4176, 0x4176, 0x4176, 0x4176, 0x4175, 0x4175, 0x4175, 0x4175, 0x4175, 0x4173, 0x4173, 0x4173, + 0x4173, 0x4173, 0x4171, 0x4171, 0x4171, 0x4171, 0x4171, 0x416f, 0x416f, 0x416f, 0x416f, 0x416f, + 0x416d, 0x416d, 0x416d, 0x416d, 0x416d, 0x416c, 0x416c, 0x416c, 0x416c, 0x416c, 0x416a, 0x416a, + 0x416a, 0x416a, 0x416a, 0x4168, 0x4168, 0x4168, 0x4168, 0x4167, 0x4167, 0x4167, 0x4167, 0x4167, + 0x4165, 0x4165, 0x4165, 0x4165, 0x4165, 0x4164, 0x4164, 0x4164, 0x4164, 0x4164, 0x4162, 0x4162, + 0x4162, 0x4162, 0x4162, 0x4160, 0x4160, 0x4160, 0x4160, 0x4160, 0x415f, 0x415f, 0x415f, 0x415f, + 0x415f, 0x415d, 0x415d, 0x415d, 0x415d, 0x415d, 0x415c, 0x415c, 0x415c, 0x415c, 0x415a, 0x415a, + 0x415a, 0x415a, 0x415a, 0x4159, 0x4159, 0x4159, 0x4159, 0x4159, 0x4158, 0x4158, 0x4158, 0x4158, + 0x4158, 0x4156, 0x4156, 0x4156, 0x4156, 0x4156, 0x4155, 0x4155, 0x4155, 0x4155, 0x4155, 0x4153, + 0x4153, 0x4153, 
0x4153, 0x4153, 0x4152, 0x4152, 0x4152, 0x4152, 0x4152, 0x4151, 0x4151, 0x4151, + 0x4151, 0x4151, 0x414f, 0x414f, 0x414f, 0x414f, 0x414e, 0x414e, 0x414e, 0x414e, 0x414e, 0x414d, + 0x414d, 0x414d, 0x414d, 0x414d, 0x414c, 0x414c, 0x414c, 0x414c, 0x414c, 0x414a, 0x414a, 0x414a, + 0x414a, 0x414a, 0x4149, 0x4149, 0x4149, 0x4149, 0x4149, 0x4148, 0x4148, 0x4148, 0x4148, 0x4148, + 0x4147, 0x4147, 0x4147, 0x4147, 0x4147, 0x4145, 0x4145, 0x4145, 0x4145, 0x4144, 0x4144, 0x4144, + 0x4144, 0x4144, 0x4143, 0x4143, 0x4143, 0x4143, 0x4143, 0x4142, 0x4142, 0x4142, 0x4142, 0x4142, + 0x4141, 0x4141, 0x4141, 0x4141, 0x4141, 0x4140, 0x4140, 0x4140, 0x4140, 0x4140, 0x413f, 0x413f, + 0x413f, 0x413f, 0x413f, 0x413d, 0x413d, 0x413d, 0x413d, 0x413d, 0x413c, 0x413c, 0x413c, 0x413c, + 0x413c, 0x413b, 0x413b, 0x413b, 0x413b, 0x413a, 0x413a, 0x413a, 0x413a, 0x413a, 0x4139, 0x4139, + 0x4139, 0x4139, 0x4139, 0x4138, 0x4138, 0x4138, 0x4138, 0x4138, 0x4137, 0x4137, 0x4137, 0x4137, + 0x4137, 0x4136, 0x4136, 0x4136, 0x4136, 0x4136, 0x4135, 0x4135, 0x4135, 0x4135, 0x4135, 0x4134, + 0x4134, 0x4134, 0x4134, 0x4134, 0x4133, 0x4133, 0x4133, 0x4133, 0x4132, 0x4132, 0x4132, 0x4132, + 0x4132, 0x4131, 0x4131, 0x4131, 0x4131, 0x4131, 0x4130, 0x4130, 0x4130, 0x4130, 0x4130, 0x412f, + 0x412f, 0x412f, 0x412f, 0x412f, 0x412e, 0x412e, 0x412e, 0x412e, 0x412e, 0x412d, 0x412d, 0x412d, + 0x412d, 0x412d, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, + 0x412b, 0x412b, 0x412b, 0x412b, 0x412a, 0x412a, 0x412a, 0x412a, 0x412a, 0x4129, 0x4129, 0x4129, + 0x4129, 0x4129, 0x4128, 0x4128, 0x4128, 0x4128, 0x4128, 0x4127, 0x4127, 0x4127, 0x4127, 0x4127, + 0x4126, 0x4126, 0x4126, 0x4126, 0x4126, 0x4125, 0x4125, 0x4125, 0x4125, 0x4125, 0x4125, 0x4125, + 0x4125, 0x4125, 0x4125, 0x4124, 0x4124, 0x4124, 0x4124, 0x4124, 0x4123, 0x4123, 0x4123, 0x4123, + 0x4122, 0x4122, 0x4122, 0x4122, 0x4122, 0x4121, 0x4121, 0x4121, 0x4121, 0x4121, 0x4121, 0x4121, + 0x4121, 0x4121, 0x4121, 0x4120, 0x411f, 0x411e, 0x411e, 
0x411d, 0x411c, 0x411b, 0x411b, 0x411a, + 0x4119, 0x4118, 0x4118, 0x4117, 0x4116, 0x4116, 0x4115, 0x4114, 0x4114, 0x4113, 0x4112, 0x4112, + 0x4111, 0x4110, 0x4110, 0x410f, +}; + +static void tl_lut_ref(uint16_t *ofmap, uint16_t *ifmap, cvk_tl_shape_t ifmap_shape) { + for (uint32_t i = 0; i < tl_shape_size(&ifmap_shape); i++) { + if (mode == PRE_DATA_COMPARE_FIX) { + ofmap[i] = test_pattern_ref[i]; + } else { + uint16_t v = convert_fp32_bf16(1 / (1.0 * (convert_bf16_fp32(ifmap[i])))); + ofmap[i] = v; + } + } +} + +static bool verify(uint16_t *ofmap_data, uint16_t *ref_data, uint16_t *ifmap, + uint64_t ifmap_shape_size, TEST_MODE mode) { + uint64_t size = ifmap_shape_size; + + for (uint64_t i = 0; i < size; i++) { + bool is_close; + uint16_t ref; + uint16_t ofmap_data_bf16; + float ref_f; + float ofmap_data_f; + + ref = ref_data[i]; + ref_f = convert_bf16_fp32(ref); + ofmap_data_f = convert_bf16_fp32(ofmap_data[i]); + ofmap_data_bf16 = ofmap_data[i]; + + if (mode == PRE_DATA_COMPARE_FIX) { + is_close = ofmap_data[i] == ref; + } else { + is_close = fabs(ref_f - ofmap_data_f) < 0.001; + } + + if (!is_close) { + fprintf(stderr, + "comparing failed at ofmap_data[%lu](input:%e), got %x, exp %x, " + "fp32: got %e exp %e\n", + i, convert_bf16_fp32(ifmap[i]), ofmap_data_bf16, ref, ofmap_data_f, ref_f); + exit(-1); + } + } + + return true; +} + +static void gen_input(uint16_t *ifmap, uint64_t ifmap_shape_size) { + if (mode == PRE_DATA_COMPARE_FIX) { + memcpy(ifmap, &test_pattern, sizeof(test_pattern)); + } else { + for (uint64_t i = 0; i < ifmap_shape_size; i++) { + srand(static_cast(time(0))); + std::random_device rd; + std::mt19937 e2(rd()); + float LO = pow(2, -10); + float HI = pow(2, 10); + // std::uniform_real_distribution<> dist(pow(2,-62), pow(2,63)); + for (uint64_t i = 0; i < ifmap_shape_size; i++) { + // float r3 = dist(e2); + float r3 = LO + static_cast(rand()) / (static_cast(RAND_MAX / (HI - LO))); + ifmap[i] = convert_fp32_bf16(r3); + } + } + } + +#ifdef DBG + 
for (uint64_t i = 0; i < ifmap_shape_size; i++) { + printf("source if[%lu] bf16 %f 0x%x, log2f is %f\n", i, convert_bf16_fp32(ifmap[i]), ifmap[i], + floor(log2((convert_bf16_fp32(ifmap[i]))))); + } +#endif /* ifdef DBG */ +} + +static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk, uint32_t input_n, uint32_t input_c, + uint32_t input_h, uint32_t input_w) { + cvk_fmt_t fmt = CVK_FMT_BF16; + + // TODO: check more shape / align + cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + cvk_tl_shape_t ofmap_shape = ifmap_shape; + cvk_tl_shape_t table_shape; + cvm_table_shape(bmk, &table_shape); + + uint64_t ifmap_shape_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + uint64_t table_size = tl_shape_size(&table_shape); + + // prepare input data with size + int data_type_size = bytesize_of_fmt(fmt); + uint64_t ifmap_bytesize = ifmap_shape_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + uint64_t table_bytesize = table_size * data_type_size; + + uint16_t *ifmap = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize); + uint16_t *table_data = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_mantissa = (uint16_t *)xmalloc(table_bytesize); + + // alloc lmem + cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf = test_alloc_tl(bmk, tl_ifmap->shape, fmt, /*align*/ 1); + cvk_tl_t *cvk_tl_table_answer = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *cvk_tl_table_answer_mantissa = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + + // generate testbench + gen_input(ifmap, ifmap_shape_size); + tl_lut_ref(ref_data, ifmap, ifmap_shape); + + // prepare table + cvm_reciprocal_tbl(table_data, table_data_mantissa, &table_shape); + + // sys->lmem + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t 
*)ifmap); + test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer, (uint8_t *)table_data); + test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer_mantissa, (uint8_t *)table_data_mantissa); + + cvm_emit_reciprocal(bmk, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_mantissa, + tl_ofmap_bf16); + + // issue cmd + test_submit_comp(ctx, bmk); + + // get output from lmem->sys + uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, tl_ofmap_bf16); + + verify(ofmap_data, ref_data, ifmap, ifmap_shape_size, mode); + + free_tl(bmk, cvk_tl_table_answer_mantissa); + free_tl(bmk, cvk_tl_table_answer); + free_tl(bmk, tl_buf); + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_ifmap); + + free(ifmap); + free(ref_data); + free(ofmap_data); + free(table_data); + free(table_data_mantissa); +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bmk; + int round_mode; + + round_mode = set_store_feround(); + + test_init(&ctx, &bmk); + + for (int i = GEN_POW_20_DATA_MAX_ERROR; i < TEST_MODE_MAX; i++) { + mode = static_cast(i); + printf("test mode %d...\n", mode); + + int input_n = 1; + int input_c = 32; + int input_h = 1; + int input_w = 1; + + if (mode == PRE_DATA_COMPARE_FIX) { + input_h = 4; + input_w = 8; + } else { + input_h = input_w = 16; + } + + testbench(&ctx, bmk, input_n, input_c, input_h, input_w); + } + + test_exit(&ctx, bmk); + restore_feround(round_mode); + return 0; +} diff --git a/cvimath/tests/cvi1835/sigmoid_linear_interp.cpp b/cvimath/tests/cvi1835/sigmoid_linear_interp.cpp new file mode 100644 index 000000000..1cb2fae38 --- /dev/null +++ b/cvimath/tests/cvi1835/sigmoid_linear_interp.cpp @@ -0,0 +1,907 @@ +//* TODO: you could rerange any value to -127~127 +#include +#include + +#define OUT +#define IN +//#define DBG + +/** + * pre_data means we test fixed pattern, it should be same sa lut + * compare fix means we MAKE SURE output values equal with golden, + * comment it for check with error using `MAX_ERROR` + */ +enum TEST_MODE { + 
PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare + PRE_DATA_MAX_ERROR, // pre-data + compare only diff < MAX_ERROR + GEN_DATA_MAX_ERROR, // gen data + compare only diff < MAX_ERROR + TEST_MODE_MAX, +}; + +static TEST_MODE mode; +#define MAX_ERROR (0.004) + +using namespace std; +static uint16_t test_pattern[] = { + 0x0000, 0x3C03, 0x3C83, 0x3CC5, 0x3D03, 0x3D24, 0x3D45, 0x3D65, 0x3D83, 0x3D93, 0x3DA4, 0x3DB4, + 0x3DC5, 0x3DD5, 0x3DE5, 0x3DF6, 0x3E03, 0x3E0B, 0x3E13, 0x3E1C, 0x3E24, 0x3E2C, 0x3E34, 0x3E3C, + 0x3E45, 0x3E4D, 0x3E55, 0x3E5D, 0x3E65, 0x3E6E, 0x3E76, 0x3E7E, 0x3E83, 0x3E87, 0x3E8B, 0x3E8F, + 0x3E93, 0x3E98, 0x3E9C, 0x3EA0, 0x3EA4, 0x3EA8, 0x3EAC, 0x3EB0, 0x3EB4, 0x3EB8, 0x3EBC, 0x3EC1, + 0x3EC5, 0x3EC9, 0x3ECD, 0x3ED1, 0x3ED5, 0x3ED9, 0x3EDD, 0x3EE1, 0x3EE5, 0x3EE9, 0x3EEE, 0x3EF2, + 0x3EF6, 0x3EFA, 0x3EFE, 0x3F01, 0x3F03, 0x3F05, 0x3F07, 0x3F09, 0x3F0B, 0x3F0D, 0x3F0F, 0x3F11, + 0x3F13, 0x3F16, 0x3F18, 0x3F1A, 0x3F1C, 0x3F1E, 0x3F20, 0x3F22, 0x3F24, 0x3F26, 0x3F28, 0x3F2A, + 0x3F2C, 0x3F2E, 0x3F30, 0x3F32, 0x3F34, 0x3F36, 0x3F38, 0x3F3A, 0x3F3C, 0x3F3E, 0x3F41, 0x3F43, + 0x3F45, 0x3F47, 0x3F49, 0x3F4B, 0x3F4D, 0x3F4F, 0x3F51, 0x3F53, 0x3F55, 0x3F57, 0x3F59, 0x3F5B, + 0x3F5D, 0x3F5F, 0x3F61, 0x3F63, 0x3F65, 0x3F67, 0x3F69, 0x3F6C, 0x3F6E, 0x3F70, 0x3F72, 0x3F74, + 0x3F76, 0x3F78, 0x3F7A, 0x3F7C, 0x3F7E, 0x3F80, 0x3F81, 0x3F82, 0x3F83, 0x3F84, 0x3F85, 0x3F86, + 0x3F87, 0x3F88, 0x3F89, 0x3F8A, 0x3F8B, 0x3F8C, 0x3F8D, 0x3F8E, 0x3F8F, 0x3F90, 0x3F91, 0x3F92, + 0x3F93, 0x3F94, 0x3F96, 0x3F97, 0x3F98, 0x3F99, 0x3F9A, 0x3F9B, 0x3F9C, 0x3F9D, 0x3F9E, 0x3F9F, + 0x3FA0, 0x3FA1, 0x3FA2, 0x3FA3, 0x3FA4, 0x3FA5, 0x3FA6, 0x3FA7, 0x3FA8, 0x3FA9, 0x3FAA, 0x3FAB, + 0x3FAC, 0x3FAD, 0x3FAE, 0x3FAF, 0x3FB0, 0x3FB1, 0x3FB2, 0x3FB3, 0x3FB4, 0x3FB5, 0x3FB6, 0x3FB7, + 0x3FB8, 0x3FB9, 0x3FBA, 0x3FBB, 0x3FBC, 0x3FBD, 0x3FBE, 0x3FBF, 0x3FC1, 0x3FC2, 0x3FC3, 0x3FC4, + 0x3FC5, 0x3FC6, 0x3FC7, 0x3FC8, 0x3FC9, 0x3FCA, 0x3FCB, 0x3FCC, 0x3FCD, 0x3FCE, 0x3FCF, 0x3FD0, + 0x3FD1, 
0x3FD2, 0x3FD3, 0x3FD4, 0x3FD5, 0x3FD6, 0x3FD7, 0x3FD8, 0x3FD9, 0x3FDA, 0x3FDB, 0x3FDC, + 0x3FDD, 0x3FDE, 0x3FDF, 0x3FE0, 0x3FE1, 0x3FE2, 0x3FE3, 0x3FE4, 0x3FE5, 0x3FE6, 0x3FE7, 0x3FE8, + 0x3FE9, 0x3FEA, 0x3FEC, 0x3FED, 0x3FEE, 0x3FEF, 0x3FF0, 0x3FF1, 0x3FF2, 0x3FF3, 0x3FF4, 0x3FF5, + 0x3FF6, 0x3FF7, 0x3FF8, 0x3FF9, 0x3FFA, 0x3FFB, 0x3FFC, 0x3FFD, 0x3FFE, 0x3FFF, 0x4000, 0x4001, + 0x4001, 0x4002, 0x4002, 0x4003, 0x4003, 0x4004, 0x4004, 0x4005, 0x4005, 0x4006, 0x4006, 0x4007, + 0x4007, 0x4008, 0x4008, 0x4009, 0x4009, 0x400A, 0x400A, 0x400B, 0x400B, 0x400C, 0x400C, 0x400D, + 0x400D, 0x400E, 0x400E, 0x400F, 0x400F, 0x4010, 0x4010, 0x4011, 0x4011, 0x4012, 0x4012, 0x4013, + 0x4013, 0x4014, 0x4014, 0x4015, 0x4016, 0x4016, 0x4017, 0x4017, 0x4018, 0x4018, 0x4019, 0x4019, + 0x401A, 0x401A, 0x401B, 0x401B, 0x401C, 0x401C, 0x401D, 0x401D, 0x401E, 0x401E, 0x401F, 0x401F, + 0x4020, 0x4020, 0x4021, 0x4021, 0x4022, 0x4022, 0x4023, 0x4023, 0x4024, 0x4024, 0x4025, 0x4025, + 0x4026, 0x4026, 0x4027, 0x4027, 0x4028, 0x4028, 0x4029, 0x4029, 0x402A, 0x402A, 0x402B, 0x402C, + 0x402C, 0x402D, 0x402D, 0x402E, 0x402E, 0x402F, 0x402F, 0x4030, 0x4030, 0x4031, 0x4031, 0x4032, + 0x4032, 0x4033, 0x4033, 0x4034, 0x4034, 0x4035, 0x4035, 0x4036, 0x4036, 0x4037, 0x4037, 0x4038, + 0x4038, 0x4039, 0x4039, 0x403A, 0x403A, 0x403B, 0x403B, 0x403C, 0x403C, 0x403D, 0x403D, 0x403E, + 0x403E, 0x403F, 0x403F, 0x4040, 0x4041, 0x4041, 0x4042, 0x4042, 0x4043, 0x4043, 0x4044, 0x4044, + 0x4045, 0x4045, 0x4046, 0x4046, 0x4047, 0x4047, 0x4048, 0x4048, 0x4049, 0x4049, 0x404A, 0x404A, + 0x404B, 0x404B, 0x404C, 0x404C, 0x404D, 0x404D, 0x404E, 0x404E, 0x404F, 0x404F, 0x4050, 0x4050, + 0x4051, 0x4051, 0x4052, 0x4052, 0x4053, 0x4053, 0x4054, 0x4054, 0x4055, 0x4056, 0x4056, 0x4057, + 0x4057, 0x4058, 0x4058, 0x4059, 0x4059, 0x405A, 0x405A, 0x405B, 0x405B, 0x405C, 0x405C, 0x405D, + 0x405D, 0x405E, 0x405E, 0x405F, 0x405F, 0x4060, 0x4060, 0x4061, 0x4061, 0x4062, 0x4062, 0x4063, + 0x4063, 0x4064, 0x4064, 0x4065, 0x4065, 0x4066, 
0x4066, 0x4067, 0x4067, 0x4068, 0x4068, 0x4069, + 0x4069, 0x406A, 0x406A, 0x406B, 0x406C, 0x406C, 0x406D, 0x406D, 0x406E, 0x406E, 0x406F, 0x406F, + 0x4070, 0x4070, 0x4071, 0x4071, 0x4072, 0x4072, 0x4073, 0x4073, 0x4074, 0x4074, 0x4075, 0x4075, + 0x4076, 0x4076, 0x4077, 0x4077, 0x4078, 0x4078, 0x4079, 0x4079, 0x407A, 0x407A, 0x407B, 0x407B, + 0x407C, 0x407C, 0x407D, 0x407D, 0x407E, 0x407E, 0x407F, 0x407F, 0x4080, 0x4080, 0x4081, 0x4081, + 0x4081, 0x4081, 0x4082, 0x4082, 0x4082, 0x4082, 0x4083, 0x4083, 0x4083, 0x4083, 0x4084, 0x4084, + 0x4084, 0x4084, 0x4085, 0x4085, 0x4085, 0x4085, 0x4086, 0x4086, 0x4086, 0x4086, 0x4087, 0x4087, + 0x4087, 0x4087, 0x4088, 0x4088, 0x4088, 0x4088, 0x4089, 0x4089, 0x4089, 0x4089, 0x408A, 0x408A, + 0x408A, 0x408A, 0x408B, 0x408B, 0x408B, 0x408C, 0x408C, 0x408C, 0x408C, 0x408D, 0x408D, 0x408D, + 0x408D, 0x408E, 0x408E, 0x408E, 0x408E, 0x408F, 0x408F, 0x408F, 0x408F, 0x4090, 0x4090, 0x4090, + 0x4090, 0x4091, 0x4091, 0x4091, 0x4091, 0x4092, 0x4092, 0x4092, 0x4092, 0x4093, 0x4093, 0x4093, + 0x4093, 0x4094, 0x4094, 0x4094, 0x4094, 0x4095, 0x4095, 0x4095, 0x4096, 0x4096, 0x4096, 0x4096, + 0x4097, 0x4097, 0x4097, 0x4097, 0x4098, 0x4098, 0x4098, 0x4098, 0x4099, 0x4099, 0x4099, 0x4099, + 0x409A, 0x409A, 0x409A, 0x409A, 0x409B, 0x409B, 0x409B, 0x409B, 0x409C, 0x409C, 0x409C, 0x409C, + 0x409D, 0x409D, 0x409D, 0x409D, 0x409E, 0x409E, 0x409E, 0x409E, 0x409F, 0x409F, 0x409F, 0x409F, + 0x40A0, 0x40A0, 0x40A0, 0x40A1, 0x40A1, 0x40A1, 0x40A1, 0x40A2, 0x40A2, 0x40A2, 0x40A2, 0x40A3, + 0x40A3, 0x40A3, 0x40A3, 0x40A4, 0x40A4, 0x40A4, 0x40A4, 0x40A5, 0x40A5, 0x40A5, 0x40A5, 0x40A6, + 0x40A6, 0x40A6, 0x40A6, 0x40A7, 0x40A7, 0x40A7, 0x40A7, 0x40A8, 0x40A8, 0x40A8, 0x40A8, 0x40A9, + 0x40A9, 0x40A9, 0x40A9, 0x40AA, 0x40AA, 0x40AA, 0x40AA, 0x40AB, 0x40AB, 0x40AB, 0x40AC, 0x40AC, + 0x40AC, 0x40AC, 0x40AD, 0x40AD, 0x40AD, 0x40AD, 0x40AE, 0x40AE, 0x40AE, 0x40AE, 0x40AF, 0x40AF, + 0x40AF, 0x40AF, 0x40B0, 0x40B0, 0x40B0, 0x40B0, 0x40B1, 0x40B1, 0x40B1, 0x40B1, 0x40B2, 
0x40B2, + 0x40B2, 0x40B2, 0x40B3, 0x40B3, 0x40B3, 0x40B3, 0x40B4, 0x40B4, 0x40B4, 0x40B4, 0x40B5, 0x40B5, + 0x40B5, 0x40B6, 0x40B6, 0x40B6, 0x40B6, 0x40B7, 0x40B7, 0x40B7, 0x40B7, 0x40B8, 0x40B8, 0x40B8, + 0x40B8, 0x40B9, 0x40B9, 0x40B9, 0x40B9, 0x40BA, 0x40BA, 0x40BA, 0x40BA, 0x40BB, 0x40BB, 0x40BB, + 0x40BB, 0x40BC, 0x40BC, 0x40BC, 0x40BC, 0x40BD, 0x40BD, 0x40BD, 0x40BD, 0x40BE, 0x40BE, 0x40BE, + 0x40BE, 0x40BF, 0x40BF, 0x40BF, 0x40BF, 0x40C0, 0x40C0, 0x40C0, 0x40C1, 0x40C1, 0x40C1, 0x40C1, + 0x40C2, 0x40C2, 0x40C2, 0x40C2, 0x40C3, 0x40C3, 0x40C3, 0x40C3, 0x40C4, 0x40C4, 0x40C4, 0x40C4, + 0x40C5, 0x40C5, 0x40C5, 0x40C5, 0x40C6, 0x40C6, 0x40C6, 0x40C6, 0x40C7, 0x40C7, 0x40C7, 0x40C7, + 0x40C8, 0x40C8, 0x40C8, 0x40C8, 0x40C9, 0x40C9, 0x40C9, 0x40C9, 0x40CA, 0x40CA, 0x40CA, 0x40CA, + 0x40CB, 0x40CB, 0x40CB, 0x40CC, 0x40CC, 0x40CC, 0x40CC, 0x40CD, 0x40CD, 0x40CD, 0x40CD, 0x40CE, + 0x40CE, 0x40CE, 0x40CE, 0x40CF, 0x40CF, 0x40CF, 0x40CF, 0x40D0, 0x40D0, 0x40D0, 0x40D0, 0x40D1, + 0x40D1, 0x40D1, 0x40D1, 0x40D2, 0x40D2, 0x40D2, 0x40D2, 0x40D3, 0x40D3, 0x40D3, 0x40D3, 0x40D4, + 0x40D4, 0x40D4, 0x40D4, 0x40D5, 0x40D5, 0x40D5, 0x40D6, 0x40D6, 0x40D6, 0x40D6, 0x40D7, 0x40D7, + 0x40D7, 0x40D7, 0x40D8, 0x40D8, 0x40D8, 0x40D8, 0x40D9, 0x40D9, 0x40D9, 0x40D9, 0x40DA, 0x40DA, + 0x40DA, 0x40DA, 0x40DB, 0x40DB, 0x40DB, 0x40DB, 0x40DC, 0x40DC, 0x40DC, 0x40DC, 0x40DD, 0x40DD, + 0x40DD, 0x40DD, 0x40DE, 0x40DE, 0x40DE, 0x40DE, 0x40DF, 0x40DF, 0x40DF, 0x40DF, 0x40E0, 0x40E0, + 0x40E0, 0x40E1, 0x40E1, 0x40E1, 0x40E1, 0x40E2, 0x40E2, 0x40E2, 0x40E2, 0x40E3, 0x40E3, 0x40E3, + 0x40E3, 0x40E4, 0x40E4, 0x40E4, 0x40E4, 0x40E5, 0x40E5, 0x40E5, 0x40E5, 0x40E6, 0x40E6, 0x40E6, + 0x40E6, 0x40E7, 0x40E7, 0x40E7, 0x40E7, 0x40E8, 0x40E8, 0x40E8, 0x40E8, 0x40E9, 0x40E9, 0x40E9, + 0x40E9, 0x40EA, 0x40EA, 0x40EA, 0x40EA, 0x40EB, 0x40EB, 0x40EB, 0x40EC, 0x40EC, 0x40EC, 0x40EC, + 0x40ED, 0x40ED, 0x40ED, 0x40ED, 0x40EE, 0x40EE, 0x40EE, 0x40EE, 0x40EF, 0x40EF, 0x40EF, 0x40EF, + 0x40F0, 0x40F0, 0x40F0, 
0x40F0, 0x40F1, 0x40F1, 0x40F1, 0x40F1, 0x40F2, 0x40F2, 0x40F2, 0x40F2, + 0x40F3, 0x40F3, 0x40F3, 0x40F3, 0x40F4, 0x40F4, 0x40F4, 0x40F4, 0x40F5, 0x40F5, 0x40F5, 0x40F6, + 0x40F6, 0x40F6, 0x40F6, 0x40F7, 0x40F7, 0x40F7, 0x40F7, 0x40F8, 0x40F8, 0x40F8, 0x40F8, 0x40F9, + 0x40F9, 0x40F9, 0x40F9, 0x40FA, 0x40FA, 0x40FA, 0x40FA, 0x40FB, 0x40FB, 0x40FB, 0x40FB, 0x40FC, + 0x40FC, 0x40FC, 0x40FC, 0x40FD, 0x40FD, 0x40FD, 0x40FD, 0x40FE, 0x40FE, 0x40FE, 0x40FE, 0x40FF, + 0x40FF, 0x40FF, 0x40FF, 0x4100, 0xBC03, 0xBC83, 0xBCC5, 0xBD03, 0xBD24, 0xBD45, 0xBD65, 0xBD83, + 0xBD93, 0xBDA4, 0xBDB4, 0xBDC5, 0xBDD5, 0xBDE5, 0xBDF6, 0xBE03, 0xBE0B, 0xBE13, 0xBE1C, 0xBE24, + 0xBE2C, 0xBE34, 0xBE3C, 0xBE45, 0xBE4D, 0xBE55, 0xBE5D, 0xBE65, 0xBE6E, 0xBE76, 0xBE7E, 0xBE83, + 0xBE87, 0xBE8B, 0xBE8F, 0xBE93, 0xBE98, 0xBE9C, 0xBEA0, 0xBEA4, 0xBEA8, 0xBEAC, 0xBEB0, 0xBEB4, + 0xBEB8, 0xBEBC, 0xBEC1, 0xBEC5, 0xBEC9, 0xBECD, 0xBED1, 0xBED5, 0xBED9, 0xBEDD, 0xBEE1, 0xBEE5, + 0xBEE9, 0xBEEE, 0xBEF2, 0xBEF6, 0xBEFA, 0xBEFE, 0xBF01, 0xBF03, 0xBF05, 0xBF07, 0xBF09, 0xBF0B, + 0xBF0D, 0xBF0F, 0xBF11, 0xBF13, 0xBF16, 0xBF18, 0xBF1A, 0xBF1C, 0xBF1E, 0xBF20, 0xBF22, 0xBF24, + 0xBF26, 0xBF28, 0xBF2A, 0xBF2C, 0xBF2E, 0xBF30, 0xBF32, 0xBF34, 0xBF36, 0xBF38, 0xBF3A, 0xBF3C, + 0xBF3E, 0xBF41, 0xBF43, 0xBF45, 0xBF47, 0xBF49, 0xBF4B, 0xBF4D, 0xBF4F, 0xBF51, 0xBF53, 0xBF55, + 0xBF57, 0xBF59, 0xBF5B, 0xBF5D, 0xBF5F, 0xBF61, 0xBF63, 0xBF65, 0xBF67, 0xBF69, 0xBF6C, 0xBF6E, + 0xBF70, 0xBF72, 0xBF74, 0xBF76, 0xBF78, 0xBF7A, 0xBF7C, 0xBF7E, 0xBF80, 0xBF81, 0xBF82, 0xBF83, + 0xBF84, 0xBF85, 0xBF86, 0xBF87, 0xBF88, 0xBF89, 0xBF8A, 0xBF8B, 0xBF8C, 0xBF8D, 0xBF8E, 0xBF8F, + 0xBF90, 0xBF91, 0xBF92, 0xBF93, 0xBF94, 0xBF96, 0xBF97, 0xBF98, 0xBF99, 0xBF9A, 0xBF9B, 0xBF9C, + 0xBF9D, 0xBF9E, 0xBF9F, 0xBFA0, 0xBFA1, 0xBFA2, 0xBFA3, 0xBFA4, 0xBFA5, 0xBFA6, 0xBFA7, 0xBFA8, + 0xBFA9, 0xBFAA, 0xBFAB, 0xBFAC, 0xBFAD, 0xBFAE, 0xBFAF, 0xBFB0, 0xBFB1, 0xBFB2, 0xBFB3, 0xBFB4, + 0xBFB5, 0xBFB6, 0xBFB7, 0xBFB8, 0xBFB9, 0xBFBA, 0xBFBB, 0xBFBC, 
0xBFBD, 0xBFBE, 0xBFBF, 0xBFC1, + 0xBFC2, 0xBFC3, 0xBFC4, 0xBFC5, 0xBFC6, 0xBFC7, 0xBFC8, 0xBFC9, 0xBFCA, 0xBFCB, 0xBFCC, 0xBFCD, + 0xBFCE, 0xBFCF, 0xBFD0, 0xBFD1, 0xBFD2, 0xBFD3, 0xBFD4, 0xBFD5, 0xBFD6, 0xBFD7, 0xBFD8, 0xBFD9, + 0xBFDA, 0xBFDB, 0xBFDC, 0xBFDD, 0xBFDE, 0xBFDF, 0xBFE0, 0xBFE1, 0xBFE2, 0xBFE3, 0xBFE4, 0xBFE5, + 0xBFE6, 0xBFE7, 0xBFE8, 0xBFE9, 0xBFEA, 0xBFEC, 0xBFED, 0xBFEE, 0xBFEF, 0xBFF0, 0xBFF1, 0xBFF2, + 0xBFF3, 0xBFF4, 0xBFF5, 0xBFF6, 0xBFF7, 0xBFF8, 0xBFF9, 0xBFFA, 0xBFFB, 0xBFFC, 0xBFFD, 0xBFFE, + 0xBFFF, 0xC000, 0xC001, 0xC001, 0xC002, 0xC002, 0xC003, 0xC003, 0xC004, 0xC004, 0xC005, 0xC005, + 0xC006, 0xC006, 0xC007, 0xC007, 0xC008, 0xC008, 0xC009, 0xC009, 0xC00A, 0xC00A, 0xC00B, 0xC00B, + 0xC00C, 0xC00C, 0xC00D, 0xC00D, 0xC00E, 0xC00E, 0xC00F, 0xC00F, 0xC010, 0xC010, 0xC011, 0xC011, + 0xC012, 0xC012, 0xC013, 0xC013, 0xC014, 0xC014, 0xC015, 0xC016, 0xC016, 0xC017, 0xC017, 0xC018, + 0xC018, 0xC019, 0xC019, 0xC01A, 0xC01A, 0xC01B, 0xC01B, 0xC01C, 0xC01C, 0xC01D, 0xC01D, 0xC01E, + 0xC01E, 0xC01F, 0xC01F, 0xC020, 0xC020, 0xC021, 0xC021, 0xC022, 0xC022, 0xC023, 0xC023, 0xC024, + 0xC024, 0xC025, 0xC025, 0xC026, 0xC026, 0xC027, 0xC027, 0xC028, 0xC028, 0xC029, 0xC029, 0xC02A, + 0xC02A, 0xC02B, 0xC02C, 0xC02C, 0xC02D, 0xC02D, 0xC02E, 0xC02E, 0xC02F, 0xC02F, 0xC030, 0xC030, + 0xC031, 0xC031, 0xC032, 0xC032, 0xC033, 0xC033, 0xC034, 0xC034, 0xC035, 0xC035, 0xC036, 0xC036, + 0xC037, 0xC037, 0xC038, 0xC038, 0xC039, 0xC039, 0xC03A, 0xC03A, 0xC03B, 0xC03B, 0xC03C, 0xC03C, + 0xC03D, 0xC03D, 0xC03E, 0xC03E, 0xC03F, 0xC03F, 0xC040, 0xC041, 0xC041, 0xC042, 0xC042, 0xC043, + 0xC043, 0xC044, 0xC044, 0xC045, 0xC045, 0xC046, 0xC046, 0xC047, 0xC047, 0xC048, 0xC048, 0xC049, + 0xC049, 0xC04A, 0xC04A, 0xC04B, 0xC04B, 0xC04C, 0xC04C, 0xC04D, 0xC04D, 0xC04E, 0xC04E, 0xC04F, + 0xC04F, 0xC050, 0xC050, 0xC051, 0xC051, 0xC052, 0xC052, 0xC053, 0xC053, 0xC054, 0xC054, 0xC055, + 0xC056, 0xC056, 0xC057, 0xC057, 0xC058, 0xC058, 0xC059, 0xC059, 0xC05A, 0xC05A, 0xC05B, 0xC05B, + 
0xC05C, 0xC05C, 0xC05D, 0xC05D, 0xC05E, 0xC05E, 0xC05F, 0xC05F, 0xC060, 0xC060, 0xC061, 0xC061, + 0xC062, 0xC062, 0xC063, 0xC063, 0xC064, 0xC064, 0xC065, 0xC065, 0xC066, 0xC066, 0xC067, 0xC067, + 0xC068, 0xC068, 0xC069, 0xC069, 0xC06A, 0xC06A, 0xC06B, 0xC06C, 0xC06C, 0xC06D, 0xC06D, 0xC06E, + 0xC06E, 0xC06F, 0xC06F, 0xC070, 0xC070, 0xC071, 0xC071, 0xC072, 0xC072, 0xC073, 0xC073, 0xC074, + 0xC074, 0xC075, 0xC075, 0xC076, 0xC076, 0xC077, 0xC077, 0xC078, 0xC078, 0xC079, 0xC079, 0xC07A, + 0xC07A, 0xC07B, 0xC07B, 0xC07C, 0xC07C, 0xC07D, 0xC07D, 0xC07E, 0xC07E, 0xC07F, 0xC07F, 0xC080, + 0xC080, 0xC081, 0xC081, 0xC081, 0xC081, 0xC082, 0xC082, 0xC082, 0xC082, 0xC083, 0xC083, 0xC083, + 0xC083, 0xC084, 0xC084, 0xC084, 0xC084, 0xC085, 0xC085, 0xC085, 0xC085, 0xC086, 0xC086, 0xC086, + 0xC086, 0xC087, 0xC087, 0xC087, 0xC087, 0xC088, 0xC088, 0xC088, 0xC088, 0xC089, 0xC089, 0xC089, + 0xC089, 0xC08A, 0xC08A, 0xC08A, 0xC08A, 0xC08B, 0xC08B, 0xC08B, 0xC08C, 0xC08C, 0xC08C, 0xC08C, + 0xC08D, 0xC08D, 0xC08D, 0xC08D, 0xC08E, 0xC08E, 0xC08E, 0xC08E, 0xC08F, 0xC08F, 0xC08F, 0xC08F, + 0xC090, 0xC090, 0xC090, 0xC090, 0xC091, 0xC091, 0xC091, 0xC091, 0xC092, 0xC092, 0xC092, 0xC092, + 0xC093, 0xC093, 0xC093, 0xC093, 0xC094, 0xC094, 0xC094, 0xC094, 0xC095, 0xC095, 0xC095, 0xC096, + 0xC096, 0xC096, 0xC096, 0xC097, 0xC097, 0xC097, 0xC097, 0xC098, 0xC098, 0xC098, 0xC098, 0xC099, + 0xC099, 0xC099, 0xC099, 0xC09A, 0xC09A, 0xC09A, 0xC09A, 0xC09B, 0xC09B, 0xC09B, 0xC09B, 0xC09C, + 0xC09C, 0xC09C, 0xC09C, 0xC09D, 0xC09D, 0xC09D, 0xC09D, 0xC09E, 0xC09E, 0xC09E, 0xC09E, 0xC09F, + 0xC09F, 0xC09F, 0xC09F, 0xC0A0, 0xC0A0, 0xC0A0, 0xC0A1, 0xC0A1, 0xC0A1, 0xC0A1, 0xC0A2, 0xC0A2, + 0xC0A2, 0xC0A2, 0xC0A3, 0xC0A3, 0xC0A3, 0xC0A3, 0xC0A4, 0xC0A4, 0xC0A4, 0xC0A4, 0xC0A5, 0xC0A5, + 0xC0A5, 0xC0A5, 0xC0A6, 0xC0A6, 0xC0A6, 0xC0A6, 0xC0A7, 0xC0A7, 0xC0A7, 0xC0A7, 0xC0A8, 0xC0A8, + 0xC0A8, 0xC0A8, 0xC0A9, 0xC0A9, 0xC0A9, 0xC0A9, 0xC0AA, 0xC0AA, 0xC0AA, 0xC0AA, 0xC0AB, 0xC0AB, + 0xC0AB, 0xC0AC, 0xC0AC, 0xC0AC, 0xC0AC, 
0xC0AD, 0xC0AD, 0xC0AD, 0xC0AD, 0xC0AE, 0xC0AE, 0xC0AE, + 0xC0AE, 0xC0AF, 0xC0AF, 0xC0AF, 0xC0AF, 0xC0B0, 0xC0B0, 0xC0B0, 0xC0B0, 0xC0B1, 0xC0B1, 0xC0B1, + 0xC0B1, 0xC0B2, 0xC0B2, 0xC0B2, 0xC0B2, 0xC0B3, 0xC0B3, 0xC0B3, 0xC0B3, 0xC0B4, 0xC0B4, 0xC0B4, + 0xC0B4, 0xC0B5, 0xC0B5, 0xC0B5, 0xC0B6, 0xC0B6, 0xC0B6, 0xC0B6, 0xC0B7, 0xC0B7, 0xC0B7, 0xC0B7, + 0xC0B8, 0xC0B8, 0xC0B8, 0xC0B8, 0xC0B9, 0xC0B9, 0xC0B9, 0xC0B9, 0xC0BA, 0xC0BA, 0xC0BA, 0xC0BA, + 0xC0BB, 0xC0BB, 0xC0BB, 0xC0BB, 0xC0BC, 0xC0BC, 0xC0BC, 0xC0BC, 0xC0BD, 0xC0BD, 0xC0BD, 0xC0BD, + 0xC0BE, 0xC0BE, 0xC0BE, 0xC0BE, 0xC0BF, 0xC0BF, 0xC0BF, 0xC0BF, 0xC0C0, 0xC0C0, 0xC0C0, 0xC0C1, + 0xC0C1, 0xC0C1, 0xC0C1, 0xC0C2, 0xC0C2, 0xC0C2, 0xC0C2, 0xC0C3, 0xC0C3, 0xC0C3, 0xC0C3, 0xC0C4, + 0xC0C4, 0xC0C4, 0xC0C4, 0xC0C5, 0xC0C5, 0xC0C5, 0xC0C5, 0xC0C6, 0xC0C6, 0xC0C6, 0xC0C6, 0xC0C7, + 0xC0C7, 0xC0C7, 0xC0C7, 0xC0C8, 0xC0C8, 0xC0C8, 0xC0C8, 0xC0C9, 0xC0C9, 0xC0C9, 0xC0C9, 0xC0CA, + 0xC0CA, 0xC0CA, 0xC0CA, 0xC0CB, 0xC0CB, 0xC0CB, 0xC0CC, 0xC0CC, 0xC0CC, 0xC0CC, 0xC0CD, 0xC0CD, + 0xC0CD, 0xC0CD, 0xC0CE, 0xC0CE, 0xC0CE, 0xC0CE, 0xC0CF, 0xC0CF, 0xC0CF, 0xC0CF, 0xC0D0, 0xC0D0, + 0xC0D0, 0xC0D0, 0xC0D1, 0xC0D1, 0xC0D1, 0xC0D1, 0xC0D2, 0xC0D2, 0xC0D2, 0xC0D2, 0xC0D3, 0xC0D3, + 0xC0D3, 0xC0D3, 0xC0D4, 0xC0D4, 0xC0D4, 0xC0D4, 0xC0D5, 0xC0D5, 0xC0D5, 0xC0D6, 0xC0D6, 0xC0D6, + 0xC0D6, 0xC0D7, 0xC0D7, 0xC0D7, 0xC0D7, 0xC0D8, 0xC0D8, 0xC0D8, 0xC0D8, 0xC0D9, 0xC0D9, 0xC0D9, + 0xC0D9, 0xC0DA, 0xC0DA, 0xC0DA, 0xC0DA, 0xC0DB, 0xC0DB, 0xC0DB, 0xC0DB, 0xC0DC, 0xC0DC, 0xC0DC, + 0xC0DC, 0xC0DD, 0xC0DD, 0xC0DD, 0xC0DD, 0xC0DE, 0xC0DE, 0xC0DE, 0xC0DE, 0xC0DF, 0xC0DF, 0xC0DF, + 0xC0DF, 0xC0E0, 0xC0E0, 0xC0E0, 0xC0E1, 0xC0E1, 0xC0E1, 0xC0E1, 0xC0E2, 0xC0E2, 0xC0E2, 0xC0E2, + 0xC0E3, 0xC0E3, 0xC0E3, 0xC0E3, 0xC0E4, 0xC0E4, 0xC0E4, 0xC0E4, 0xC0E5, 0xC0E5, 0xC0E5, 0xC0E5, + 0xC0E6, 0xC0E6, 0xC0E6, 0xC0E6, 0xC0E7, 0xC0E7, 0xC0E7, 0xC0E7, 0xC0E8, 0xC0E8, 0xC0E8, 0xC0E8, + 0xC0E9, 0xC0E9, 0xC0E9, 0xC0E9, 0xC0EA, 0xC0EA, 0xC0EA, 0xC0EA, 0xC0EB, 0xC0EB, 
0xC0EB, 0xC0EC, + 0xC0EC, 0xC0EC, 0xC0EC, 0xC0ED, 0xC0ED, 0xC0ED, 0xC0ED, 0xC0EE, 0xC0EE, 0xC0EE, 0xC0EE, 0xC0EF, + 0xC0EF, 0xC0EF, 0xC0EF, 0xC0F0, 0xC0F0, 0xC0F0, 0xC0F0, 0xC0F1, 0xC0F1, 0xC0F1, 0xC0F1, 0xC0F2, + 0xC0F2, 0xC0F2, 0xC0F2, 0xC0F3, 0xC0F3, 0xC0F3, 0xC0F3, 0xC0F4, 0xC0F4, 0xC0F4, 0xC0F4, 0xC0F5, + 0xC0F5, 0xC0F5, 0xC0F6, 0xC0F6, 0xC0F6, 0xC0F6, 0xC0F7, 0xC0F7, 0xC0F7, 0xC0F7, 0xC0F8, 0xC0F8, + 0xC0F8, 0xC0F8, 0xC0F9, 0xC0F9, 0xC0F9, 0xC0F9, 0xC0FA, 0xC0FA, 0xC0FA, 0xC0FA, 0xC0FB, 0xC0FB, + 0xC0FB, 0xC0FB, 0xC0FC, 0xC0FC, 0xC0FC, 0xC0FC, 0xC0FD, 0xC0FD, 0xC0FD, 0xC0FD, 0xC0FE, 0xC0FE, + 0xC0FE, 0xC0FE, 0xC0FF, 0xC0FF, 0xC0FF, 0xC0FF, 0xC100, 0xC100, +}; + +static uint16_t sigmode_golden_bf16[] = { + 0x3f00, 0x3f01, 0x3f01, 0x3f02, 0x3f02, 0x3f03, 0x3f03, 0x3f04, 0x3f04, 0x3f05, 0x3f05, 0x3f06, + 0x3f06, 0x3f07, 0x3f07, 0x3f08, 0x3f08, 0x3f09, 0x3f09, 0x3f0a, 0x3f0a, 0x3f0b, 0x3f0b, 0x3f0c, + 0x3f0c, 0x3f0d, 0x3f0d, 0x3f0e, 0x3f0e, 0x3f0f, 0x3f0f, 0x3f10, 0x3f10, 0x3f11, 0x3f11, 0x3f12, + 0x3f12, 0x3f13, 0x3f13, 0x3f14, 0x3f14, 0x3f15, 0x3f15, 0x3f16, 0x3f16, 0x3f17, 0x3f17, 0x3f18, + 0x3f19, 0x3f19, 0x3f1a, 0x3f1a, 0x3f1b, 0x3f1b, 0x3f1b, 0x3f1c, 0x3f1d, 0x3f1d, 0x3f1e, 0x3f1e, + 0x3f1f, 0x3f1f, 0x3f20, 0x3f1f, 0x3f20, 0x3f20, 0x3f21, 0x3f21, 0x3f22, 0x3f22, 0x3f23, 0x3f23, + 0x3f24, 0x3f24, 0x3f25, 0x3f25, 0x3f26, 0x3f26, 0x3f27, 0x3f27, 0x3f28, 0x3f28, 0x3f29, 0x3f29, + 0x3f2a, 0x3f2a, 0x3f2a, 0x3f2a, 0x3f2b, 0x3f2b, 0x3f2c, 0x3f2c, 0x3f2d, 0x3f2d, 0x3f2e, 0x3f2f, + 0x3f2f, 0x3f30, 0x3f30, 0x3f30, 0x3f31, 0x3f31, 0x3f31, 0x3f32, 0x3f32, 0x3f32, 0x3f33, 0x3f33, + 0x3f34, 0x3f34, 0x3f35, 0x3f36, 0x3f36, 0x3f36, 0x3f37, 0x3f37, 0x3f38, 0x3f38, 0x3f38, 0x3f39, + 0x3f39, 0x3f3a, 0x3f3a, 0x3f3a, 0x3f3b, 0x3f3b, 0x3f3b, 0x3f3c, 0x3f3c, 0x3f3d, 0x3f3d, 0x3f3d, + 0x3f3e, 0x3f3e, 0x3f3e, 0x3f3f, 0x3f3f, 0x3f40, 0x3f40, 0x3f40, 0x3f41, 0x3f41, 0x3f41, 0x3f42, + 0x3f42, 0x3f42, 0x3f43, 0x3f44, 0x3f44, 0x3f44, 0x3f45, 0x3f45, 0x3f45, 0x3f46, 0x3f46, 0x3f46, + 
0x3f47, 0x3f47, 0x3f48, 0x3f48, 0x3f48, 0x3f49, 0x3f49, 0x3f49, 0x3f4a, 0x3f4a, 0x3f4b, 0x3f4b, + 0x3f4b, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4d, 0x3f4d, 0x3f4d, 0x3f4e, 0x3f4e, 0x3f4e, + 0x3f4f, 0x3f4f, 0x3f50, 0x3f50, 0x3f50, 0x3f51, 0x3f51, 0x3f51, 0x3f51, 0x3f52, 0x3f52, 0x3f52, + 0x3f52, 0x3f53, 0x3f53, 0x3f54, 0x3f54, 0x3f55, 0x3f55, 0x3f55, 0x3f55, 0x3f56, 0x3f56, 0x3f56, + 0x3f56, 0x3f57, 0x3f57, 0x3f57, 0x3f57, 0x3f58, 0x3f58, 0x3f58, 0x3f58, 0x3f59, 0x3f59, 0x3f59, + 0x3f59, 0x3f5a, 0x3f5a, 0x3f5a, 0x3f5a, 0x3f5a, 0x3f5b, 0x3f5b, 0x3f5b, 0x3f5b, 0x3f5c, 0x3f5c, + 0x3f5c, 0x3f5c, 0x3f5d, 0x3f5d, 0x3f5d, 0x3f5e, 0x3f5e, 0x3f5e, 0x3f5e, 0x3f5f, 0x3f5f, 0x3f5f, + 0x3f5f, 0x3f60, 0x3f60, 0x3f60, 0x3f60, 0x3f61, 0x3f61, 0x3f61, 0x3f61, 0x3f62, 0x3f61, 0x3f61, + 0x3f61, 0x3f62, 0x3f62, 0x3f62, 0x3f62, 0x3f63, 0x3f63, 0x3f63, 0x3f63, 0x3f64, 0x3f64, 0x3f64, + 0x3f64, 0x3f65, 0x3f65, 0x3f65, 0x3f65, 0x3f66, 0x3f66, 0x3f66, 0x3f66, 0x3f66, 0x3f66, 0x3f66, + 0x3f66, 0x3f67, 0x3f67, 0x3f67, 0x3f67, 0x3f68, 0x3f68, 0x3f68, 0x3f68, 0x3f69, 0x3f69, 0x3f69, + 0x3f69, 0x3f69, 0x3f69, 0x3f69, 0x3f6a, 0x3f6a, 0x3f6a, 0x3f6a, 0x3f6a, 0x3f6a, 0x3f6a, 0x3f6a, + 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6c, 0x3f6c, 0x3f6c, 0x3f6c, + 0x3f6d, 0x3f6d, 0x3f6d, 0x3f6d, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e, + 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f70, 0x3f70, + 0x3f70, 0x3f70, 0x3f70, 0x3f70, 0x3f70, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71, + 0x3f71, 0x3f72, 0x3f72, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f72, 0x3f72, 0x3f72, + 0x3f72, 0x3f72, 0x3f72, 0x3f72, 0x3f72, 0x3f73, 0x3f73, 0x3f73, 0x3f73, 0x3f73, 0x3f73, 0x3f73, + 0x3f73, 0x3f74, 0x3f74, 0x3f74, 0x3f74, 0x3f74, 0x3f74, 0x3f74, 0x3f75, 0x3f75, 0x3f75, 0x3f75, + 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, + 0x3f75, 0x3f75, 0x3f76, 0x3f76, 0x3f76, 
0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, + 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f77, 0x3f77, 0x3f77, 0x3f77, 0x3f77, 0x3f77, + 0x3f77, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, + 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, + 0x3f78, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, + 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, + 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, + 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, + 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, + 0x3f7b, 0x3f7b, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, + 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, + 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7d, 0x3f7d, + 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, + 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, + 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, + 0x3f7d, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, + 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, + 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, + 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, + 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, + 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 
0x3f7e, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3eff, 0x3efe, 0x3efd, 0x3efc, 0x3efb, 0x3efa, 0x3ef9, 0x3ef8, + 0x3ef7, 0x3ef6, 0x3ef5, 0x3ef4, 0x3ef3, 0x3ef2, 0x3ef1, 0x3ef0, 0x3eef, 0x3eee, 0x3eed, 0x3eec, + 0x3eeb, 0x3eea, 0x3ee9, 0x3ee7, 0x3ee6, 0x3ee5, 0x3ee4, 0x3ee3, 0x3ee2, 0x3ee1, 0x3ee0, 0x3edf, + 0x3ede, 0x3edd, 0x3edc, 0x3edb, 0x3eda, 0x3ed9, 0x3ed8, 0x3ed7, 0x3ed6, 0x3ed5, 0x3ed4, 0x3ed3, + 0x3ed2, 0x3ed1, 0x3ed1, 0x3ed0, 0x3ecf, 0x3ece, 0x3ecd, 0x3ecc, 0x3ecb, 0x3eca, 0x3ec9, 0x3ec8, + 0x3ec7, 0x3ec6, 0x3ec5, 0x3ec4, 0x3ec3, 0x3ec2, 0x3ec1, 0x3ec0, 0x3ebf, 0x3ebe, 0x3ebd, 0x3ebc, + 0x3ebb, 0x3eba, 0x3eba, 0x3eb9, 0x3eb7, 0x3eb6, 0x3eb5, 0x3eb4, 0x3eb4, 0x3eb3, 0x3eb2, 0x3eb1, + 0x3eb0, 0x3eaf, 0x3eaf, 0x3eae, 0x3ead, 0x3eab, 0x3eaa, 0x3ea9, 0x3ea8, 0x3ea7, 0x3ea7, 0x3ea6, + 0x3ea5, 0x3ea4, 0x3ea3, 0x3ea2, 0x3ea1, 0x3ea0, 0x3e9f, 0x3e9e, 0x3e9e, 0x3e9d, 0x3e9c, 0x3e9b, + 0x3e9a, 0x3e99, 0x3e98, 0x3e98, 0x3e97, 0x3e97, 0x3e96, 0x3e95, 0x3e94, 0x3e93, 0x3e92, 0x3e91, + 0x3e90, 0x3e8f, 0x3e8e, 0x3e8e, 0x3e8d, 0x3e8c, 0x3e8b, 0x3e8a, 0x3e8a, 0x3e89, 0x3e88, 0x3e88, + 0x3e87, 0x3e86, 0x3e85, 0x3e85, 0x3e83, 0x3e82, 0x3e82, 
0x3e81, 0x3e80, 0x3e7e, 0x3e7d, 0x3e7c, + 0x3e7b, 0x3e7a, 0x3e78, 0x3e77, 0x3e75, 0x3e72, 0x3e71, 0x3e6f, 0x3e6e, 0x3e6c, 0x3e6b, 0x3e69, + 0x3e68, 0x3e67, 0x3e65, 0x3e64, 0x3e63, 0x3e61, 0x3e60, 0x3e5f, 0x3e5d, 0x3e5c, 0x3e5a, 0x3e59, + 0x3e58, 0x3e56, 0x3e55, 0x3e54, 0x3e52, 0x3e51, 0x3e50, 0x3e4f, 0x3e4e, 0x3e4c, 0x3e4b, 0x3e4a, + 0x3e49, 0x3e47, 0x3e46, 0x3e45, 0x3e44, 0x3e43, 0x3e41, 0x3e40, 0x3e3f, 0x3e3e, 0x3e3c, 0x3e3a, + 0x3e39, 0x3e37, 0x3e36, 0x3e35, 0x3e34, 0x3e33, 0x3e31, 0x3e30, 0x3e2f, 0x3e2e, 0x3e2c, 0x3e2b, + 0x3e2a, 0x3e29, 0x3e28, 0x3e27, 0x3e26, 0x3e25, 0x3e24, 0x3e23, 0x3e22, 0x3e20, 0x3e20, 0x3e1f, + 0x3e1e, 0x3e1d, 0x3e1c, 0x3e1b, 0x3e1a, 0x3e19, 0x3e18, 0x3e17, 0x3e16, 0x3e15, 0x3e14, 0x3e13, + 0x3e12, 0x3e11, 0x3e10, 0x3e0f, 0x3e0e, 0x3e0c, 0x3e0b, 0x3e0a, 0x3e09, 0x3e08, 0x3e07, 0x3e06, + 0x3e05, 0x3e04, 0x3e03, 0x3e03, 0x3e02, 0x3e01, 0x3e00, 0x3dff, 0x3dfd, 0x3dfb, 0x3df9, 0x3df8, + 0x3df6, 0x3df4, 0x3df1, 0x3df1, 0x3ded, 0x3ded, 0x3dea, 0x3dea, 0x3de7, 0x3de7, 0x3de4, 0x3de4, + 0x3de1, 0x3de1, 0x3dde, 0x3dde, 0x3ddb, 0x3ddb, 0x3dd8, 0x3dd8, 0x3dd5, 0x3dd5, 0x3dd2, 0x3dd2, + 0x3dcf, 0x3dcf, 0x3dcc, 0x3dcc, 0x3dc9, 0x3dc9, 0x3dc7, 0x3dc7, 0x3dc3, 0x3dc3, 0x3dc0, 0x3dc0, + 0x3dbe, 0x3dbe, 0x3dbb, 0x3dbb, 0x3db9, 0x3db9, 0x3db6, 0x3db4, 0x3db4, 0x3db1, 0x3db1, 0x3dae, + 0x3dae, 0x3dac, 0x3dac, 0x3da9, 0x3da9, 0x3da7, 0x3da7, 0x3da5, 0x3da5, 0x3da3, 0x3da3, 0x3da0, + 0x3da0, 0x3d9e, 0x3d9e, 0x3d9b, 0x3d9b, 0x3d99, 0x3d99, 0x3d97, 0x3d97, 0x3d94, 0x3d94, 0x3d93, + 0x3d93, 0x3d91, 0x3d91, 0x3d8f, 0x3d8f, 0x3d8d, 0x3d8d, 0x3d8a, 0x3d8a, 0x3d88, 0x3d88, 0x3d86, + 0x3d86, 0x3d84, 0x3d82, 0x3d82, 0x3d80, 0x3d80, 0x3d7d, 0x3d7d, 0x3d79, 0x3d79, 0x3d76, 0x3d76, + 0x3d72, 0x3d72, 0x3d6f, 0x3d6f, 0x3d6b, 0x3d6b, 0x3d68, 0x3d68, 0x3d65, 0x3d65, 0x3d61, 0x3d61, + 0x3d5e, 0x3d5e, 0x3d5b, 0x3d5b, 0x3d58, 0x3d58, 0x3d55, 0x3d55, 0x3d52, 0x3d52, 0x3d4e, 0x3d4e, + 0x3d4b, 0x3d4b, 0x3d48, 0x3d48, 0x3d45, 0x3d45, 0x3d42, 0x3d3f, 0x3d3f, 0x3d3c, 0x3d3c, 0x3d3a, 
+ 0x3d3a, 0x3d37, 0x3d37, 0x3d34, 0x3d34, 0x3d32, 0x3d32, 0x3d2f, 0x3d2f, 0x3d2c, 0x3d2c, 0x3d2a, + 0x3d2a, 0x3d27, 0x3d27, 0x3d24, 0x3d24, 0x3d22, 0x3d22, 0x3d20, 0x3d20, 0x3d1d, 0x3d1d, 0x3d1b, + 0x3d1b, 0x3d19, 0x3d19, 0x3d17, 0x3d17, 0x3d15, 0x3d15, 0x3d12, 0x3d12, 0x3d10, 0x3d10, 0x3d0e, + 0x3d0c, 0x3d0c, 0x3d0a, 0x3d0a, 0x3d08, 0x3d08, 0x3d06, 0x3d06, 0x3d04, 0x3d04, 0x3d02, 0x3d02, + 0x3cff, 0x3cff, 0x3cfb, 0x3cfb, 0x3cf8, 0x3cf8, 0x3cf4, 0x3cf4, 0x3cf0, 0x3cf0, 0x3cec, 0x3cec, + 0x3ce9, 0x3ce9, 0x3ce5, 0x3ce5, 0x3ce2, 0x3ce2, 0x3cdf, 0x3cdf, 0x3cdb, 0x3cdb, 0x3cd8, 0x3cd8, + 0x3cd5, 0x3cd5, 0x3cd2, 0x3cd2, 0x3ccf, 0x3ccf, 0x3ccc, 0x3cc8, 0x3cc8, 0x3cc5, 0x3cc5, 0x3cc2, + 0x3cc2, 0x3cbf, 0x3cbf, 0x3cbc, 0x3cbc, 0x3cb9, 0x3cb9, 0x3cb6, 0x3cb6, 0x3cb4, 0x3cb4, 0x3cb1, + 0x3cb1, 0x3cae, 0x3cae, 0x3cac, 0x3cac, 0x3ca9, 0x3ca9, 0x3ca7, 0x3ca7, 0x3ca5, 0x3ca5, 0x3ca2, + 0x3ca2, 0x3ca0, 0x3ca0, 0x3c9d, 0x3c9d, 0x3c9b, 0x3c9b, 0x3c98, 0x3c98, 0x3c96, 0x3c96, 0x3c93, + 0x3c93, 0x3c8f, 0x3c8f, 0x3c8f, 0x3c8f, 0x3c8b, 0x3c8b, 0x3c8b, 0x3c8b, 0x3c87, 0x3c87, 0x3c87, + 0x3c87, 0x3c82, 0x3c82, 0x3c82, 0x3c82, 0x3c7c, 0x3c7c, 0x3c7c, 0x3c7c, 0x3c75, 0x3c75, 0x3c75, + 0x3c75, 0x3c6e, 0x3c6e, 0x3c6e, 0x3c6e, 0x3c66, 0x3c66, 0x3c66, 0x3c66, 0x3c5f, 0x3c5f, 0x3c5f, + 0x3c5f, 0x3c59, 0x3c59, 0x3c59, 0x3c59, 0x3c53, 0x3c53, 0x3c53, 0x3c4c, 0x3c4c, 0x3c4c, 0x3c4c, + 0x3c46, 0x3c46, 0x3c46, 0x3c46, 0x3c3f, 0x3c3f, 0x3c3f, 0x3c3f, 0x3c39, 0x3c39, 0x3c39, 0x3c39, + 0x3c34, 0x3c34, 0x3c34, 0x3c34, 0x3c2f, 0x3c2f, 0x3c2f, 0x3c2f, 0x3c29, 0x3c29, 0x3c29, 0x3c29, + 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c1f, 0x3c1f, 0x3c1f, 0x3c1f, 0x3c1a, 0x3c1a, 0x3c1a, 0x3c16, + 0x3c16, 0x3c16, 0x3c16, 0x3c12, 0x3c12, 0x3c12, 0x3c12, 0x3c0d, 0x3c0d, 0x3c0d, 0x3c0d, 0x3c09, + 0x3c09, 0x3c09, 0x3c09, 0x3c04, 0x3c04, 0x3c04, 0x3c04, 0x3c00, 0x3c00, 0x3c00, 0x3c00, 0x3bf8, + 0x3bf8, 0x3bf8, 0x3bf8, 0x3bf1, 0x3bf1, 0x3bf1, 0x3bf1, 0x3be9, 0x3be9, 0x3be9, 0x3be9, 0x3be2, + 0x3be2, 0x3be2, 0x3be2, 0x3bdb, 
0x3bdb, 0x3bdb, 0x3bd4, 0x3bd4, 0x3bd4, 0x3bd4, 0x3bce, 0x3bce, + 0x3bce, 0x3bce, 0x3bc8, 0x3bc8, 0x3bc8, 0x3bc8, 0x3bc2, 0x3bc2, 0x3bc2, 0x3bc2, 0x3bbc, 0x3bbc, + 0x3bbc, 0x3bbc, 0x3bb6, 0x3bb6, 0x3bb6, 0x3bb6, 0x3bb0, 0x3bb0, 0x3bb0, 0x3bb0, 0x3bab, 0x3bab, + 0x3bab, 0x3bab, 0x3ba6, 0x3ba6, 0x3ba6, 0x3ba6, 0x3ba1, 0x3ba1, 0x3ba1, 0x3ba1, 0x3b9c, 0x3b9c, + 0x3b9c, 0x3b97, 0x3b97, 0x3b97, 0x3b97, 0x3b92, 0x3b92, 0x3b92, 0x3b92, 0x3b8e, 0x3b8e, 0x3b8e, + 0x3b8e, 0x3b8a, 0x3b8a, 0x3b8a, 0x3b8a, 0x3b85, 0x3b85, 0x3b85, 0x3b85, 0x3b81, 0x3b81, 0x3b81, + 0x3b81, 0x3b7b, 0x3b7b, 0x3b7b, 0x3b7b, 0x3b73, 0x3b73, 0x3b73, 0x3b73, 0x3b6c, 0x3b6c, 0x3b6c, + 0x3b6c, 0x3b65, 0x3b65, 0x3b65, 0x3b5d, 0x3b5d, 0x3b5d, 0x3b5d, 0x3b56, 0x3b56, 0x3b56, 0x3b56, + 0x3b50, 0x3b50, 0x3b50, 0x3b50, 0x3b4a, 0x3b4a, 0x3b4a, 0x3b4a, 0x3b43, 0x3b43, 0x3b43, 0x3b43, + 0x3b3d, 0x3b3d, 0x3b3d, 0x3b3d, 0x3b38, 0x3b38, 0x3b38, 0x3b38, 0x3b32, 0x3b32, 0x3b32, 0x3b32, + 0x3b2c, 0x3b2c, 0x3b2c, 0x3b2c, 0x3b27, 0x3b27, 0x3b27, 0x3b27, 0x3b22, 0x3b22, 0x3b22, 0x3b1d, + 0x3b1d, 0x3b1d, 0x3b1d, 0x3b18, 0x3b18, 0x3b18, 0x3b18, 0x3b13, 0x3b13, 0x3b13, 0x3b13, 0x3b0f, + 0x3b0f, 0x3b0f, 0x3b0f, 0x3b0b, 0x3b0b, 0x3b0b, 0x3b0b, 0x3b06, 0x3b06, 0x3b06, 0x3b06, 0x3b02, + 0x3b02, 0x3b02, 0x3b02, 0x3afd, 0x3afd, 0x3afd, 0x3afd, 0x3af5, 0x3af5, 0x3af5, 0x3af5, 0x3aed, + 0x3aed, 0x3aed, 0x3aed, 0x3ae6, 0x3ae6, 0x3ae6, 0x3adf, 0x3adf, 0x3adf, 0x3adf, 0x3ad8, 0x3ad8, + 0x3ad8, 0x3ad8, 0x3ad1, 0x3ad1, 0x3ad1, 0x3ad1, 0x3acb, 0x3acb, 0x3acb, 0x3acb, 0x3ac5, 0x3ac5, + 0x3ac5, 0x3ac5, 0x3abf, 0x3abf, 0x3abf, 0x3abf, 0x3ab9, 0x3ab9, 0x3ab9, 0x3ab9, 0x3ab3, 0x3ab3, + 0x3ab3, 0x3ab3, 0x3aae, 0x3aae, 0x3aae, 0x3aae, 0x3aa9, 0x3aa9, 0x3aa9, 0x3aa3, 0x3aa3, 0x3aa3, + 0x3aa3, 0x3a9e, 0x3a9e, 0x3a9e, 0x3a9e, 0x3a99, 0x3a99, 0x3a99, 0x3a99, 0x3a94, 0x3a94, 0x3a94, + 0x3a94, 0x3a90, 0x3a90, 0x3a90, 0x3a90, 0x3a8c, 0x3a8c, 0x3a8c, 0x3a8c, 0x3a87, 0x3a87, 0x3a87, + 0x3a87, 0x3a83, 0x3a83, 0x3a83, 0x3a83, 0x3a7e, 0x3a7e, 0x3a7e, 0x3a7e, 
0x3a76, 0x3a76, 0x3a76, + 0x3a76, 0x3a6f, 0x3a6f, 0x3a6f, 0x3a68, 0x3a68, 0x3a68, 0x3a68, 0x3a60, 0x3a60, 0x3a60, 0x3a60, + 0x3a59, 0x3a59, 0x3a59, 0x3a59, 0x3a53, 0x3a53, 0x3a53, 0x3a53, 0x3a4d, 0x3a4d, 0x3a4d, 0x3a4d, + 0x3a46, 0x3a46, 0x3a46, 0x3a46, 0x3a40, 0x3a40, 0x3a40, 0x3a40, 0x3a3a, 0x3a3a, 0x3a3a, 0x3a3a, + 0x3a34, 0x3a34, 0x3a34, 0x3a34, 0x3a2f, 0x3a2f, 0x3a2f, 0x3a2f, 0x3a2a, 0x3a2a, 0x3a2a, 0x3a24, + 0x3a24, 0x3a24, 0x3a24, 0x3a1f, 0x3a1f, 0x3a1f, 0x3a1f, 0x3a1a, 0x3a1a, 0x3a1a, 0x3a1a, 0x3a15, + 0x3a15, 0x3a15, 0x3a15, 0x3a11, 0x3a11, 0x3a11, 0x3a11, 0x3a0d, 0x3a0d, 0x3a0d, 0x3a0d, 0x3a08, + 0x3a08, 0x3a08, 0x3a08, 0x3a04, 0x3a04, 0x3a04, 0x3a04, 0x3a00, 0x3a00, 0x3a00, 0x3a00, 0x39f8, + 0x39f8, 0x39f8, 0x39f0, 0x39f0, 0x39f0, 0x39f0, 0x39e9, 0x39e9, 0x39e9, 0x39e9, 0x39e2, 0x39e2, + 0x39e2, 0x39e2, 0x39db, 0x39db, 0x39db, 0x39db, 0x39d4, 0x39d4, 0x39d4, 0x39d4, 0x39ce, 0x39ce, + 0x39ce, 0x39ce, 0x39c7, 0x39c7, 0x39c7, 0x39c7, 0x39c1, 0x39c1, 0x39c1, 0x39c1, 0x39bb, 0x39bb, + 0x39bb, 0x39bb, 0x39b5, 0x39b5, 0x39b5, 0x39b5, 0x39b0, 0x39b0, +}; + +// FIXME: not hard code +// contribute from hw, fix with `PRE_DATA` input +static double sigmode_golden[] = { + 0.5, 0.501999989, 0.503999915, 0.505999712, 0.507999317, 0.509998667, 0.511997697, + 0.513996342, 0.515994541, 0.517992228, 0.51998934, 0.521985814, 0.523981585, 0.525976591, + 0.527970767, 0.529964052, 0.531956381, 0.533947691, 0.535937921, 0.537927006, 0.539914885, + 0.541901494, 0.543886772, 0.545870657, 0.547853086, 0.549833997, 0.55181333, 0.553791023, + 0.555767014, 0.557741243, 0.559713649, 0.561684172, 0.56365275, 0.565619325, 0.567583836, + 0.569546224, 0.571506429, 0.573464394, 0.575420058, 0.577373363, 0.579324252, 0.581272667, + 0.583218549, 0.585161842, 0.58710249, 0.589040434, 0.59097562, 0.59290799, 0.594837491, + 0.596764066, 0.59868766, 0.60060822, 0.60252569, 0.604440017, 0.606351149, 0.608259031, + 0.610163611, 0.612064837, 0.613962657, 0.61585702, 0.617747875, 0.61963517, 
0.621518857, + 0.623398885, 0.625275204, 0.627147766, 0.629016523, 0.630881426, 0.632742428, 0.634599482, + 0.63645254, 0.638301558, 0.640146488, 0.641987286, 0.643823907, 0.645656306, 0.64748444, + 0.649308265, 0.651127739, 0.652942818, 0.654753461, 0.656559626, 0.658361272, 0.66015836, + 0.661950848, 0.663738697, 0.665521869, 0.667300325, 0.669074026, 0.670842936, 0.672607017, + 0.674366233, 0.676120548, 0.677869926, 0.679614333, 0.681353734, 0.683088095, 0.684817383, + 0.686541565, 0.688260608, 0.689974481, 0.691683153, 0.693386592, 0.695084769, 0.696777653, + 0.698465216, 0.700147429, 0.701824263, 0.703495691, 0.705161686, 0.706822221, 0.70847727, + 0.710126808, 0.71177081, 0.71340925, 0.715042106, 0.716669353, 0.718290968, 0.71990693, + 0.721517216, 0.723121805, 0.724720676, 0.726313808, 0.727901182, 0.729482779, 0.731058579, + 0.732628564, 0.734192716, 0.735751018, 0.737303454, 0.738850006, 0.740390659, 0.741925398, + 0.743454208, 0.744977074, 0.746493983, 0.748004922, 0.749509876, 0.751008835, 0.752501785, + 0.753988716, 0.755469617, 0.756944477, 0.758413287, 0.759876035, 0.761332715, 0.762783316, + 0.764227831, 0.765666252, 0.767098572, 0.768524783, 0.769944881, 0.771358858, 0.772766709, + 0.774168429, 0.775564014, 0.77695346, 0.778336762, 0.779713917, 0.781084923, 0.782449776, + 0.783808476, 0.78516102, 0.786507407, 0.787847636, 0.789181707, 0.790509619, 0.791831373, + 0.79314697, 0.794456411, 0.795759698, 0.797056831, 0.798347814, 0.79963265, 0.80091134, + 0.802183889, 0.803450299, 0.804710577, 0.805964724, 0.807212748, 0.808454651, 0.809690441, + 0.810920123, 0.812143702, 0.813361186, 0.814572581, 0.815777894, 0.816977132, 0.818170304, + 0.819357418, 0.820538481, 0.821713502, 0.82288249, 0.824045455, 0.825202406, 0.826353353, + 0.827498306, 0.828637274, 0.82977027, 0.830897303, 0.832018385, 0.833133528, 0.834242742, + 0.83534604, 0.836443435, 0.837534937, 0.838620561, 0.83970032, 0.840774225, 0.841842291, + 0.842904531, 0.843960959, 0.84501159, 
0.846056436, 0.847095514, 0.848128836, 0.84915642, + 0.850178278, 0.851194427, 0.852204883, 0.85320966, 0.854208775, 0.855202244, 0.856190082, + 0.857172307, 0.858148935, 0.859119982, 0.860085466, 0.861045403, 0.861999811, 0.862948707, + 0.863892109, 0.864830034, 0.8657625, 0.866689525, 0.867611126, 0.868527324, 0.869438134, + 0.870343577, 0.871243671, 0.872138434, 0.873027885, 0.873912043, 0.874790928, 0.875664558, + 0.876532952, 0.877396131, 0.878254114, 0.879106919, 0.879954567, 0.880797078, 0.881634471, + 0.882466767, 0.883293985, 0.884116145, 0.884933268, 0.885745374, 0.886552483, 0.887354615, + 0.888151792, 0.888944033, 0.88973136, 0.890513792, 0.89129135, 0.892064056, 0.89283193, + 0.893594992, 0.894353264, 0.895106767, 0.895855521, 0.896599549, 0.897338869, 0.898073505, + 0.898803476, 0.899528804, 0.900249511, 0.900965617, 0.901677143, 0.902384111, 0.903086543, + 0.903784458, 0.90447788, 0.905166828, 0.905851324, 0.90653139, 0.907207047, 0.907878316, + 0.908545218, 0.909207776, 0.90986601, 0.910519941, 0.911169591, 0.911814981, 0.912456133, + 0.913093067, 0.913725806, 0.914354369, 0.91497878, 0.915599058, 0.916215226, 0.916827304, + 0.917435313, 0.918039275, 0.91863921, 0.919235141, 0.919827088, 0.920415072, 0.920999114, + 0.921579235, 0.922155456, 0.922727798, 0.923296282, 0.923860929, 0.92442176, 0.924978795, + 0.925532055, 0.926081561, 0.926627334, 0.927169394, 0.927707762, 0.928242458, 0.928773503, + 0.929300917, 0.929824721, 0.930344935, 0.93086158, 0.931374675, 0.931884241, 0.932390297, + 0.932892865, 0.933391964, 0.933887615, 0.934379836, 0.934868648, 0.93535407, 0.935836124, + 0.936314827, 0.9367902, 0.937262263, 0.937731034, 0.938196534, 0.938658781, 0.939117796, + 0.939573597, 0.940026203, 0.940475634, 0.940921909, 0.941365046, 0.941805065, 0.942241985, + 0.942675824, 0.943106601, 0.943534335, 0.943959044, 0.944380747, 0.944799462, 0.945215208, + 0.945628003, 0.946037865, 0.946444813, 0.946848864, 0.947250036, 0.947648348, 0.948043817, + 
0.948436462, 0.948826299, 0.949213347, 0.949597623, 0.949979144, 0.950357929, 0.950733994, + 0.951107357, 0.951478034, 0.951846044, 0.952211402, 0.952574127, 0.952934234, 0.953291742, + 0.953646665, 0.953999022, 0.954348829, 0.954696102, 0.955040858, 0.955383113, 0.955722883, + 0.956060185, 0.956395034, 0.956727447, 0.95705744, 0.957385028, 0.957710228, 0.958033055, + 0.958353525, 0.958671653, 0.958987455, 0.959300946, 0.959612142, 0.959921058, 0.960227709, + 0.960532111, 0.960834277, 0.961134224, 0.961431966, 0.961727518, 0.962020894, 0.962312109, + 0.962601179, 0.962888117, 0.963172937, 0.963455655, 0.963736284, 0.964014838, 0.964291332, + 0.96456578, 0.964838195, 0.965108591, 0.965376983, 0.965643384, 0.965907808, 0.966170267, + 0.966430777, 0.966689349, 0.966945998, 0.967200737, 0.967453578, 0.967704535, 0.967953622, + 0.96820085, 0.968446233, 0.968689784, 0.968931516, 0.96917144, 0.969409571, 0.969645919, + 0.969880498, 0.97011332, 0.970344398, 0.970573743, 0.970801367, 0.971027284, 0.971251504, + 0.97147404, 0.971694904, 0.971914107, 0.972131661, 0.972347578, 0.972561869, 0.972774546, + 0.97298562, 0.973195103, 0.973403006, 0.973609341, 0.973814117, 0.974017347, 0.974219042, + 0.974419212, 0.974617868, 0.974815021, 0.975010683, 0.975204863, 0.975397572, 0.97558882, + 0.975778619, 0.975966979, 0.97615391, 0.976339422, 0.976523525, 0.97670623, 0.976887547, + 0.977067486, 0.977246057, 0.977423269, 0.977599132, 0.977773657, 0.977946853, 0.978118729, + 0.978289296, 0.978458562, 0.978626537, 0.978793231, 0.978958653, 0.979122812, 0.979285717, + 0.979447378, 0.979607804, 0.979767003, 0.979924985, 0.980081758, 0.980237332, 0.980391715, + 0.980544915, 0.980696943, 0.980847805, 0.980997512, 0.981146071, 0.98129349, 0.981439779, + 0.981584945, 0.981728996, 0.981871942, 0.98201379, 0.982154548, 0.982294225, 0.982432827, + 0.982570364, 0.982706843, 0.982842273, 0.982976659, 0.983110012, 0.983242337, 0.983373644, + 0.983503939, 0.983633229, 0.983761524, 0.983888829, 
0.984015152, 0.9841405, 0.984264882, + 0.984388303, 0.984510772, 0.984632294, 0.984752879, 0.984872531, 0.984991259, 0.985109069, + 0.985225968, 0.985341963, 0.985457061, 0.985571269, 0.985684592, 0.985797039, 0.985908614, + 0.986019326, 0.98612918, 0.986238183, 0.986346341, 0.986453661, 0.986560148, 0.98666581, + 0.986770653, 0.986874682, 0.986977903, 0.987080324, 0.98718195, 0.987282786, 0.987382839, + 0.987482115, 0.98758062, 0.98767836, 0.987775339, 0.987871565, 0.987967043, 0.988061778, + 0.988155776, 0.988249042, 0.988341583, 0.988433404, 0.98852451, 0.988614907, 0.9887046, + 0.988793594, 0.988881895, 0.988969507, 0.989056437, 0.98914269, 0.98922827, 0.989313183, + 0.989397433, 0.989481027, 0.989563968, 0.989646262, 0.989727914, 0.989808929, 0.989889312, + 0.989969066, 0.990048198, 0.990126712, 0.990204613, 0.990281905, 0.990358593, 0.990434681, + 0.990510175, 0.990585079, 0.990659397, 0.990733134, 0.990806295, 0.990878883, 0.990950903, + 0.99102236, 0.991093257, 0.9911636, 0.991233391, 0.991302637, 0.99137134, 0.991439506, + 0.991507137, 0.991574239, 0.991640815, 0.991706869, 0.991772406, 0.991837429, 0.991901942, + 0.99196595, 0.992029456, 0.992092463, 0.992154977, 0.992217, 0.992278537, 0.992339591, + 0.992400166, 0.992460265, 0.992519893, 0.992579053, 0.992637749, 0.992695983, 0.99275376, + 0.992811084, 0.992867957, 0.992924384, 0.992980367, 0.993035911, 0.993091018, 0.993145692, + 0.993199936, 0.993253754, 0.993307149, 0.993360124, 0.993412683, 0.993464828, 0.993516563, + 0.993567892, 0.993618816, 0.99366934, 0.993719466, 0.993769198, 0.993818539, 0.993867491, + 0.993916059, 0.993964243, 0.994012049, 0.994059478, 0.994106533, 0.994153219, 0.994199536, + 0.994245489, 0.994291079, 0.994336311, 0.994381186, 0.994425708, 0.994469878, 0.994513701, + 0.994557178, 0.994600313, 0.994643108, 0.994685565, 0.994727688, 0.994769478, 0.994810939, + 0.994852073, 0.994892883, 0.994933371, 0.994973539, 0.995013391, 0.995052928, 0.995092153, + 0.995131069, 0.995169677, 
0.995207981, 0.995245983, 0.995283685, 0.995321089, 0.995358198, + 0.995395014, 0.995431539, 0.995467776, 0.995503727, 0.995539394, 0.995574779, 0.995609885, + 0.995644713, 0.995679266, 0.995713547, 0.995747556, 0.995781297, 0.995814772, 0.995847981, + 0.995880929, 0.995913616, 0.995946044, 0.995978217, 0.996010135, 0.996041801, 0.996073216, + 0.996104383, 0.996135304, 0.99616598, 0.996196413, 0.996226606, 0.996256561, 0.996286278, + 0.99631576, 0.996345009, 0.996374027, 0.996402815, 0.996431375, 0.99645971, 0.99648782, + 0.996515708, 0.996543375, 0.996570823, 0.996598054, 0.99662507, 0.996651872, 0.996678461, + 0.99670484, 0.99673101, 0.996756974, 0.996782731, 0.996808285, 0.996833636, 0.996858787, + 0.996883738, 0.996908492, 0.99693305, 0.996957413, 0.996981584, 0.997005563, 0.997029352, + 0.997052952, 0.997076366, 0.997099594, 0.997122638, 0.9971455, 0.99716818, 0.997190681, + 0.997213004, 0.997235149, 0.99725712, 0.997278916, 0.997300539, 0.997321991, 0.997343273, + 0.997364386, 0.997385332, 0.997406112, 0.997426727, 0.997447179, 0.997467468, 0.997487597, + 0.997507566, 0.997527377, 0.997547031, 0.997566528, 0.997585872, 0.997605062, 0.997624099, + 0.997642986, 0.997661723, 0.997680312, 0.997698752, 0.997717047, 0.997735197, 0.997753202, + 0.997771065, 0.997788786, 0.997806367, 0.997823808, 0.99784111, 0.997858276, 0.997875305, + 0.997892199, 0.997908959, 0.997925586, 0.997942081, 0.997958445, 0.99797468, 0.997990785, + 0.998006763, 0.998022614, 0.998038339, 0.998053939, 0.998069415, 0.998084769, 0.998100001, + 0.998115112, 0.998130102, 0.998144974, 0.998159728, 0.998174365, 0.998188885, 0.99820329, + 0.998217581, 0.998231759, 0.998245823, 0.998259777, 0.998273619, 0.998287351, 0.998300975, + 0.99831449, 0.998327898, 0.998341199, 0.998354395, 0.998367486, 0.998380473, 0.998393356, + 0.998406138, 0.998418818, 0.998431397, 0.998443876, 0.998456256, 0.998468538, 0.998480723, + 0.99849281, 0.998504802, 0.998516698, 0.998528499, 0.998540207, 0.998551822, 
0.998563345, + 0.998574776, 0.998586116, 0.998597366, 0.998608527, 0.998619599, 0.998630583, 0.99864148, + 0.99865229, 0.998663015, 0.998673654, 0.998684208, 0.998694679, 0.998705066, 0.998715371, + 0.998725594, 0.998735736, 0.998745797, 0.998755778, 0.99876568, 0.998775503, 0.998785248, + 0.998794916, 0.998804507, 0.998814021, 0.99882346, 0.998832824, 0.998842113, 0.998851329, + 0.998860471, 0.998869541, 0.998878538, 0.998887464, 0.998896319, 0.998905104, 0.998913818, + 0.998922464, 0.99893104, 0.998939549, 0.99894799, 0.998956364, 0.998964671, 0.998972912, + 0.998981088, 0.998989198, 0.998997244, 0.999005226, 0.999013145, 0.999021001, 0.999028794, + 0.999036525, 0.999044195, 0.999051803, 0.999059352, 0.99906684, 0.999074268, 0.999081638, + 0.999088949, 0.999096202, 0.999103397, 0.999110535, 0.999117616, 0.99912464, 0.999131609, + 0.999138523, 0.999145381, 0.999152185, 0.999158935, 0.999165631, 0.999172274, 0.999178864, + 0.999185401, 0.999191887, 0.999198321, 0.999204704, 0.999211036, 0.999217317, 0.999223549, + 0.999229731, 0.999235864, 0.999241948, 0.999247984, 0.999253971, 0.999259911, 0.999265804, + 0.99927165, 0.999277449, 0.999283202, 0.99928891, 0.999294572, 0.999300189, 0.999305761, + 0.999311289, 0.999316773, 0.999322213, 0.99932761, 0.999332964, 0.999338276, 0.999343545, + 0.999348772, 0.999353958, 0.999359103, 0.999364206, 0.999369269, 0.999374291, 0.999379274, + 0.999384217, 0.999389121, 0.999393985, 0.999398811, 0.999403599, 0.999408348, 0.99941306, + 0.999417734, 0.99942237, 0.99942697, 0.999431534, 0.999436061, 0.999440552, 0.999445007, + 0.999449427, 0.999453811, 0.999458161, 0.999462476, 0.999466757, 0.999471004, 0.999475217, + 0.999479396, 0.999483542, 0.999487655, 0.999491735, 0.999495783, 0.999499799, 0.999503783, + 0.999507735, 0.999511655, 0.999515544, 0.999519403, 0.99952323, 0.999527027, 0.999530794, + 0.999534531, 0.999538238, 0.999541916, 0.999545564, 0.999549184, 0.999552774, 0.999556336, + 0.99955987, 0.999563375, 0.999566853, 
0.999570303, 0.999573725, 0.99957712, 0.999580488, + 0.99958383, 0.999587145, 0.999590433, 0.999593695, 0.999596931, 0.999600142, 0.999603326, + 0.999606486, 0.99960962, 0.99961273, 0.999615814, 0.999618874, 0.99962191, 0.999624921, + 0.999627909, 0.999630873, 0.999633813, 0.99963673, 0.999639623, 0.999642494, 0.999645341, + 0.999648166, 0.999650969, 0.999653749, 0.999656507, 0.999659243, 0.999661957, 0.498000011, + 0.496000085, 0.494000288, 0.492000683, 0.490001333, 0.488002303, 0.486003658, 0.484005459, + 0.482007772, 0.48001066, 0.478014186, 0.476018415, 0.474023409, 0.472029233, 0.470035948, + 0.468043619, 0.466052309, 0.464062079, 0.462072994, 0.460085115, 0.458098506, 0.456113228, + 0.454129343, 0.452146914, 0.450166003, 0.44818667, 0.446208977, 0.444232986, 0.442258757, + 0.440286351, 0.438315828, 0.43634725, 0.434380675, 0.432416164, 0.430453776, 0.428493571, + 0.426535606, 0.424579942, 0.422626637, 0.420675748, 0.418727333, 0.416781451, 0.414838158, + 0.41289751, 0.410959566, 0.40902438, 0.40709201, 0.405162509, 0.403235934, 0.40131234, + 0.39939178, 0.39747431, 0.395559983, 0.393648851, 0.391740969, 0.389836389, 0.387935163, + 0.386037343, 0.38414298, 0.382252125, 0.38036483, 0.378481143, 0.376601115, 0.374724796, + 0.372852234, 0.370983477, 0.369118574, 0.367257572, 0.365400518, 0.36354746, 0.361698442, + 0.359853512, 0.358012714, 0.356176093, 0.354343694, 0.35251556, 0.350691735, 0.348872261, + 0.347057182, 0.345246539, 0.343440374, 0.341638728, 0.33984164, 0.338049152, 0.336261303, + 0.334478131, 0.332699675, 0.330925974, 0.329157064, 0.327392983, 0.325633767, 0.323879452, + 0.322130074, 0.320385667, 0.318646266, 0.316911905, 0.315182617, 0.313458435, 0.311739392, + 0.310025519, 0.308316847, 0.306613408, 0.304915231, 0.303222347, 0.301534784, 0.299852571, + 0.298175737, 0.296504309, 0.294838314, 0.293177779, 0.29152273, 0.289873192, 0.28822919, + 0.28659075, 0.284957894, 0.283330647, 0.281709032, 0.28009307, 0.278482784, 0.276878195, + 0.275279324, 
0.273686192, 0.272098818, 0.270517221, 0.268941421, 0.267371436, 0.265807284, + 0.264248982, 0.262696546, 0.261149994, 0.259609341, 0.258074602, 0.256545792, 0.255022926, + 0.253506017, 0.251995078, 0.250490124, 0.248991165, 0.247498215, 0.246011284, 0.244530383, + 0.243055523, 0.241586713, 0.240123965, 0.238667285, 0.237216684, 0.235772169, 0.234333748, + 0.232901428, 0.231475217, 0.230055119, 0.228641142, 0.227233291, 0.225831571, 0.224435986, + 0.22304654, 0.221663238, 0.220286083, 0.218915077, 0.217550224, 0.216191524, 0.21483898, + 0.213492593, 0.212152364, 0.210818293, 0.209490381, 0.208168627, 0.20685303, 0.205543589, + 0.204240302, 0.202943169, 0.201652186, 0.20036735, 0.19908866, 0.197816111, 0.196549701, + 0.195289423, 0.194035276, 0.192787252, 0.191545349, 0.190309559, 0.189079877, 0.187856298, + 0.186638814, 0.185427419, 0.184222106, 0.183022868, 0.181829696, 0.180642582, 0.179461519, + 0.178286498, 0.17711751, 0.175954545, 0.174797594, 0.173646647, 0.172501694, 0.171362726, + 0.17022973, 0.169102697, 0.167981615, 0.166866472, 0.165757258, 0.16465396, 0.163556565, + 0.162465063, 0.161379439, 0.16029968, 0.159225775, 0.158157709, 0.157095469, 0.156039041, + 0.15498841, 0.153943564, 0.152904486, 0.151871164, 0.15084358, 0.149821722, 0.148805573, + 0.147795117, 0.14679034, 0.145791225, 0.144797756, 0.143809918, 0.142827693, 0.141851065, + 0.140880018, 0.139914534, 0.138954597, 0.138000189, 0.137051293, 0.136107891, 0.135169966, + 0.1342375, 0.133310475, 0.132388874, 0.131472676, 0.130561866, 0.129656423, 0.128756329, + 0.127861566, 0.126972115, 0.126087957, 0.125209072, 0.124335442, 0.123467048, 0.122603869, + 0.121745886, 0.120893081, 0.120045433, 0.119202922, 0.118365529, 0.117533233, 0.116706015, + 0.115883855, 0.115066732, 0.114254626, 0.113447517, 0.112645385, 0.111848208, 0.111055967, + 0.11026864, 0.109486208, 0.10870865, 0.107935944, 0.10716807, 0.106405008, 0.105646736, + 0.104893233, 0.104144479, 0.103400451, 0.102661131, 0.101926495, 
0.101196524, 0.100471196, + 0.099750489, 0.099034383, 0.098322857, 0.097615889, 0.096913457, 0.096215542, 0.09552212, + 0.094833172, 0.094148676, 0.09346861, 0.092792953, 0.092121684, 0.091454782, 0.090792224, + 0.09013399, 0.089480059, 0.088830409, 0.088185019, 0.087543867, 0.086906933, 0.086274194, + 0.085645631, 0.08502122, 0.084400942, 0.083784774, 0.083172696, 0.082564687, 0.081960725, + 0.08136079, 0.080764859, 0.080172912, 0.079584928, 0.079000886, 0.078420765, 0.077844544, + 0.077272202, 0.076703718, 0.076139071, 0.07557824, 0.075021205, 0.074467945, 0.073918439, + 0.073372666, 0.072830606, 0.072292238, 0.071757542, 0.071226497, 0.070699083, 0.070175279, + 0.069655065, 0.06913842, 0.068625325, 0.068115759, 0.067609703, 0.067107135, 0.066608036, + 0.066112385, 0.065620164, 0.065131352, 0.06464593, 0.064163876, 0.063685173, 0.0632098, + 0.062737737, 0.062268966, 0.061803466, 0.061341219, 0.060882204, 0.060426403, 0.059973797, + 0.059524366, 0.059078091, 0.058634954, 0.058194935, 0.057758015, 0.057324176, 0.056893399, + 0.056465665, 0.056040956, 0.055619253, 0.055200538, 0.054784792, 0.054371997, 0.053962135, + 0.053555187, 0.053151136, 0.052749964, 0.052351652, 0.051956183, 0.051563538, 0.051173701, + 0.050786653, 0.050402377, 0.050020856, 0.049642071, 0.049266006, 0.048892643, 0.048521966, + 0.048153956, 0.047788598, 0.047425873, 0.047065766, 0.046708258, 0.046353335, 0.046000978, + 0.045651171, 0.045303898, 0.044959142, 0.044616887, 0.044277117, 0.043939815, 0.043604966, + 0.043272553, 0.04294256, 0.042614972, 0.042289772, 0.041966945, 0.041646475, 0.041328347, + 0.041012545, 0.040699054, 0.040387858, 0.040078942, 0.039772291, 0.039467889, 0.039165723, + 0.038865776, 0.038568034, 0.038272482, 0.037979106, 0.037687891, 0.037398821, 0.037111883, + 0.036827063, 0.036544345, 0.036263716, 0.035985162, 0.035708668, 0.03543422, 0.035161805, + 0.034891409, 0.034623017, 0.034356616, 0.034092192, 0.033829733, 0.033569223, 0.033310651, + 0.033054002, 0.032799263, 
0.032546422, 0.032295465, 0.032046378, 0.03179915, 0.031553767, + 0.031310216, 0.031068484, 0.03082856, 0.030590429, 0.030354081, 0.030119502, 0.02988668, + 0.029655602, 0.029426257, 0.029198633, 0.028972716, 0.028748496, 0.02852596, 0.028305096, + 0.028085893, 0.027868339, 0.027652422, 0.027438131, 0.027225454, 0.02701438, 0.026804897, + 0.026596994, 0.026390659, 0.026185883, 0.025982653, 0.025780958, 0.025580788, 0.025382132, + 0.025184979, 0.024989317, 0.024795137, 0.024602428, 0.02441118, 0.024221381, 0.024033021, + 0.02384609, 0.023660578, 0.023476475, 0.02329377, 0.023112453, 0.022932514, 0.022753943, + 0.022576731, 0.022400868, 0.022226343, 0.022053147, 0.021881271, 0.021710704, 0.021541438, + 0.021373463, 0.021206769, 0.021041347, 0.020877188, 0.020714283, 0.020552622, 0.020392196, + 0.020232997, 0.020075015, 0.019918242, 0.019762668, 0.019608285, 0.019455085, 0.019303057, + 0.019152195, 0.019002488, 0.018853929, 0.01870651, 0.018560221, 0.018415055, 0.018271004, + 0.018128058, 0.01798621, 0.017845452, 0.017705775, 0.017567173, 0.017429636, 0.017293157, + 0.017157727, 0.017023341, 0.016889988, 0.016757663, 0.016626356, 0.016496061, 0.016366771, + 0.016238476, 0.016111171, 0.015984848, 0.0158595, 0.015735118, 0.015611697, 0.015489228, + 0.015367706, 0.015247121, 0.015127469, 0.015008741, 0.014890931, 0.014774032, 0.014658037, + 0.014542939, 0.014428731, 0.014315408, 0.014202961, 0.014091386, 0.013980674, 0.01387082, + 0.013761817, 0.013653659, 0.013546339, 0.013439852, 0.01333419, 0.013229347, 0.013125318, + 0.013022097, 0.012919676, 0.01281805, 0.012717214, 0.012617161, 0.012517885, 0.01241938, + 0.01232164, 0.012224661, 0.012128435, 0.012032957, 0.011938222, 0.011844224, 0.011750958, + 0.011658417, 0.011566596, 0.01147549, 0.011385093, 0.0112954, 0.011206406, 0.011118105, + 0.011030493, 0.010943563, 0.01085731, 0.01077173, 0.010686817, 0.010602567, 0.010518973, + 0.010436032, 0.010353738, 0.010272086, 0.010191071, 0.010110688, 0.010030934, 0.009951802, + 
0.009873288, 0.009795387, 0.009718095, 0.009641407, 0.009565319, 0.009489825, 0.009414921, + 0.009340603, 0.009266866, 0.009193705, 0.009121117, 0.009049097, 0.00897764, 0.008906743, + 0.0088364, 0.008766609, 0.008697363, 0.00862866, 0.008560494, 0.008492863, 0.008425761, + 0.008359185, 0.008293131, 0.008227594, 0.008162571, 0.008098058, 0.00803405, 0.007970544, + 0.007907537, 0.007845023, 0.007783, 0.007721463, 0.007660409, 0.007599834, 0.007539735, + 0.007480107, 0.007420947, 0.007362251, 0.007304017, 0.00724624, 0.007188916, 0.007132043, + 0.007075616, 0.007019633, 0.006964089, 0.006908982, 0.006854308, 0.006800064, 0.006746246, + 0.006692851, 0.006639876, 0.006587317, 0.006535172, 0.006483437, 0.006432108, 0.006381184, + 0.00633066, 0.006280534, 0.006230802, 0.006181461, 0.006132509, 0.006083941, 0.006035757, + 0.005987951, 0.005940522, 0.005893467, 0.005846781, 0.005800464, 0.005754511, 0.005708921, + 0.005663689, 0.005618814, 0.005574292, 0.005530122, 0.005486299, 0.005442822, 0.005399687, + 0.005356892, 0.005314435, 0.005272312, 0.005230522, 0.005189061, 0.005147927, 0.005107117, + 0.005066629, 0.005026461, 0.004986609, 0.004947072, 0.004907847, 0.004868931, 0.004830323, + 0.004792019, 0.004754017, 0.004716315, 0.004678911, 0.004641802, 0.004604986, 0.004568461, + 0.004532224, 0.004496273, 0.004460606, 0.004425221, 0.004390115, 0.004355287, 0.004320734, + 0.004286453, 0.004252444, 0.004218703, 0.004185228, 0.004152019, 0.004119071, 0.004086384, + 0.004053956, 0.004021783, 0.003989865, 0.003958199, 0.003926784, 0.003895617, 0.003864696, + 0.00383402, 0.003803587, 0.003773394, 0.003743439, 0.003713722, 0.00368424, 0.003654991, + 0.003625973, 0.003597185, 0.003568625, 0.00354029, 0.00351218, 0.003484292, 0.003456625, + 0.003429177, 0.003401946, 0.00337493, 0.003348128, 0.003321539, 0.00329516, 0.00326899, + 0.003243026, 0.003217269, 0.003191715, 0.003166364, 0.003141213, 0.003116262, 0.003091508, + 0.00306695, 0.003042587, 0.003018416, 0.002994437, 0.002970648, 
0.002947048, 0.002923634, + 0.002900406, 0.002877362, 0.0028545, 0.00283182, 0.002809319, 0.002786996, 0.002764851, + 0.00274288, 0.002721084, 0.002699461, 0.002678009, 0.002656727, 0.002635614, 0.002614668, + 0.002593888, 0.002573273, 0.002552821, 0.002532532, 0.002512403, 0.002492434, 0.002472623, + 0.002452969, 0.002433472, 0.002414128, 0.002394938, 0.002375901, 0.002357014, 0.002338277, + 0.002319688, 0.002301248, 0.002282953, 0.002264803, 0.002246798, 0.002228935, 0.002211214, + 0.002193633, 0.002176192, 0.00215889, 0.002141724, 0.002124695, 0.002107801, 0.002091041, + 0.002074414, 0.002057919, 0.002041555, 0.00202532, 0.002009215, 0.001993237, 0.001977386, + 0.001961661, 0.001946061, 0.001930585, 0.001915231, 0.001899999, 0.001884888, 0.001869898, + 0.001855026, 0.001840272, 0.001825635, 0.001811115, 0.00179671, 0.001782419, 0.001768241, + 0.001754177, 0.001740223, 0.001726381, 0.001712649, 0.001699025, 0.00168551, 0.001672102, + 0.001658801, 0.001645605, 0.001632514, 0.001619527, 0.001606644, 0.001593862, 0.001581182, + 0.001568603, 0.001556124, 0.001543744, 0.001531462, 0.001519277, 0.00150719, 0.001495198, + 0.001483302, 0.001471501, 0.001459793, 0.001448178, 0.001436655, 0.001425224, 0.001413884, + 0.001402634, 0.001391473, 0.001380401, 0.001369417, 0.00135852, 0.00134771, 0.001336985, + 0.001326346, 0.001315792, 0.001305321, 0.001294934, 0.001284629, 0.001274406, 0.001264264, + 0.001254203, 0.001244222, 0.00123432, 0.001224497, 0.001214752, 0.001205084, 0.001195493, + 0.001185979, 0.00117654, 0.001167176, 0.001157887, 0.001148671, 0.001139529, 0.001130459, + 0.001121462, 0.001112536, 0.001103681, 0.001094896, 0.001086182, 0.001077536, 0.00106896, + 0.001060451, 0.00105201, 0.001043636, 0.001035329, 0.001027088, 0.001018912, 0.001010802, + 0.001002756, 0.000994774, 0.000986855, 0.000978999, 0.000971206, 0.000963475, 0.000955805, + 0.000948197, 0.000940648, 0.00093316, 0.000925732, 0.000918362, 0.000911051, 0.000903798, + 0.000896603, 0.000889465, 
0.000882384, 0.00087536, 0.000868391, 0.000861477, 0.000854619, + 0.000847815, 0.000841065, 0.000834369, 0.000827726, 0.000821136, 0.000814599, 0.000808113, + 0.000801679, 0.000795296, 0.000788964, 0.000782683, 0.000776451, 0.000770269, 0.000764136, + 0.000758052, 0.000752016, 0.000746029, 0.000740089, 0.000734196, 0.00072835, 0.000722551, + 0.000716798, 0.00071109, 0.000705428, 0.000699811, 0.000694239, 0.000688711, 0.000683227, + 0.000677787, 0.00067239, 0.000667036, 0.000661724, 0.000656455, 0.000651228, 0.000646042, + 0.000640897, 0.000635794, 0.000630731, 0.000625709, 0.000620726, 0.000615783, 0.000610879, + 0.000606015, 0.000601189, 0.000596401, 0.000591652, 0.00058694, 0.000582266, 0.00057763, + 0.00057303, 0.000568466, 0.000563939, 0.000559448, 0.000554993, 0.000550573, 0.000546189, + 0.000541839, 0.000537524, 0.000533243, 0.000528996, 0.000524783, 0.000520604, 0.000516458, + 0.000512345, 0.000508265, 0.000504217, 0.000500201, 0.000496217, 0.000492265, 0.000488345, + 0.000484456, 0.000480597, 0.00047677, 0.000472973, 0.000469206, 0.000465469, 0.000461762, + 0.000458084, 0.000454436, 0.000450816, 0.000447226, 0.000443664, 0.00044013, 0.000436625, + 0.000433147, 0.000429697, 0.000426275, 0.00042288, 0.000419512, 0.00041617, 0.000412855, + 0.000409567, 0.000406305, 0.000403069, 0.000399858, 0.000396674, 0.000393514, 0.00039038, + 0.00038727, 0.000384186, 0.000381126, 0.00037809, 0.000375079, 0.000372091, 0.000369127, + 0.000366187, 0.00036327, 0.000360377, 0.000357506, 0.000354659, 0.000351834, 0.000349031, + 0.000346251, 0.000343493, 0.000340757, 0.000338043, 0.00033535}; + +// static bool check_input_int8_range(float input) +//{ +// bool ret = input > -128.0 && input < 128.0; +// if (!ret) { +// printf("invalid int8 range, input is %f\n", input); +// } +// return ret; +//} + +static double _gen_sigmoid(float x) { return 1.0 / (1.0 + exp(-(x))); } + +static void tl_lut_ref(uint16_t *ofmap, uint16_t *ifmap, uint16_t *table, uint16_t *table_slope, + 
cvk_tl_shape_t ifmap_shape, cvk_tl_shape_t table_shape, int range_start, + int range_end) { + int tn, th, tw; + + tn = table_shape.n; + th = table_shape.h; + tw = table_shape.w; + (void)tn; + (void)th; + (void)tw; + (void)table; + (void)table_slope; + (void)range_start; + (void)range_end; + assert(tn == 1); + assert(th * tw == 256); + assert(table); + assert(table_slope); + assert(ifmap_shape.n); + assert(ifmap); + assert(ofmap); + + // TODO: use c function + // 1. dump all input as binary file +#ifdef GDB +#define INFP32FILE "infp32file.bin" +#define OUTBF16FILE "lutbf16out.bin" + FILE *pFile; + pFile = fopen(INFP32FILE, "wb"); + int shape_sz = tl_shape_size(&ifmap_shape); + float *f = new float[shape_sz]; + for (int i = 0; i < shape_sz; i++) { + f[i] = convert_bf16_fp32(ifmap[i]); + } + fwrite(f, 1, shape_sz * sizeof(float), pFile); + fclose(pFile); + + // 2. read result from `eval_lut.py` + char command[256]; + sprintf(command, + "python eval_lut.py --lut_input_range_start %d --lut_input_range_end " + "%d --inputfloat32 %s --outputbf16 %s 2>&1 > 2\n", + range_start, range_end, INFP32FILE, OUTBF16FILE); + + int r; + r = system(command); + printf("command is %s, return %d\n", command, r); + assert(r != 0); + + pFile = fopen(OUTBF16FILE, "rb"); + if (!pFile) { + fprintf(stderr, "open golden %s fail\n", OUTBF16FILE); + exit(-1); + } + + size_t file_length; + file_length = fread(ofmap, sizeof(uint16_t), tl_shape_size(&ifmap_shape), pFile); + printf("read from golden, file size %lu\n", file_length); + fclose(pFile); +#else + assert(range_start); + assert(range_end); + for (uint64_t i = 0; i < tl_shape_size(&ifmap_shape); i++) { + ofmap[i] = convert_fp32_bf16(_gen_sigmoid(convert_bf16_fp32(ifmap[i]))); + } +#endif + +#ifdef GDB + for (uint64_t i = 0; i < tl_shape_size(&ifmap_shape); i++) { + printf("ref %lu input 0x%x(%f) golden 0x%x(%f)\n", i, ifmap[i], convert_bf16_fp32(ifmap[i]), + ofmap[i], convert_bf16_fp32(ofmap[i])); + } +#endif +} + +static bool verify(uint16_t 
*ofmap_data, uint16_t *ref_data, uint64_t ofmap_size) { + int count = 0; + uint64_t size = ofmap_size; + if (mode == PRE_DATA_COMPARE_FIX) { + size = sizeof(sigmode_golden_bf16) / sizeof(uint16_t); + } else if (PRE_DATA_MAX_ERROR) { + size = sizeof(sigmode_golden) / sizeof(double); + } + + for (uint64_t i = 0; i < size; i++) { + if (mode == PRE_DATA_COMPARE_FIX) { + if (ofmap_data[i] != sigmode_golden_bf16[i]) { + fprintf(stderr, "[%d] comparing failed at ofmap_data[%lu], got %x, exp %x\n", count, i, + ofmap_data[i], sigmode_golden_bf16[i]); + exit(-1); + } + } else { + float got = convert_bf16_fp32(ofmap_data[i]); + float exp = convert_bf16_fp32(ref_data[i]); + + if (mode == PRE_DATA_MAX_ERROR) { + // cus we have better accuracy ~ 0.0039 + exp = sigmode_golden[i]; + } + + if (fabs(got - exp) > MAX_ERROR) { + fprintf(stderr, + "[%d] comparing failed at ofmap_data[%lu], got %x, exp %x, " + "diff(%f - %f) is %f\n", + count, i, ofmap_data[i], ref_data[i], got, exp, fabs(got - exp)); + count++; + } + } + } + + if (count != 0) { + printf("error count is %d\n", count); + exit(-1); + } + + return true; +} + +static void gen_input(uint16_t *ifmap, uint64_t ifmap_size) { + if (mode == PRE_DATA_COMPARE_FIX || mode == PRE_DATA_MAX_ERROR) { + memcpy(ifmap, &test_pattern, sizeof(test_pattern)); + +#ifdef GDB + for (uint64_t i = 0; i < ifmap_size; i++) { + printf("source if[%lu] is bf16 %f (bf16)with 0x%x\n", i, convert_bf16_fp32(ifmap[i]), + ifmap[i]); + } +#endif + } else { + int table_hw = 256; + for (uint64_t i = 0; i < ifmap_size; i++) { + // input range is -8 ~ +8 + float input = ((int)i % 7) * (((int)i % 2) ? 1 : -1) + 0.03 + (i % table_hw) * 0.002; + // float input = ((int)i % 10) * (((int)i % 2) ? 
1 : -1) + 0.03 + (i % + // table_hw) * 0.002; + // assert(check_input_int8_range(input)); + ifmap[i] = convert_fp32_bf16(input); +#ifdef GDB + printf("source if[%lu] is bf16 %f, input is %f (bf16)with 0x%x\n", i, + convert_bf16_fp32(ifmap[i]), input, ifmap[i]); +#endif + } + } +} + +static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) { + // TODO: check more shape / align + cvk_tl_shape_t ifmap_shape; + if (mode == PRE_DATA_COMPARE_FIX || mode == PRE_DATA_MAX_ERROR) { + ifmap_shape = {1, 32, 8, 8}; + } else { + ifmap_shape = {1, 32, 16, 16}; + } + + cvk_fmt_t fmt = CVK_FMT_BF16; + + // get table / input shape + cvk_tl_shape_t table_shape; + cvm_table_shape(bmk, &table_shape); + cvk_tl_shape_t ofmap_shape = ifmap_shape; + + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t table_size = tl_shape_size(&table_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + + int data_type_size = bytesize_of_fmt(fmt); + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + uint64_t table_bytesize = table_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + + // alloc tg + uint16_t *ifmap = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *table_data = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_slope = (uint16_t *)xmalloc(table_bytesize); + uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize); + + // range depend on ur activation + int range_start = -8; + int range_end = 8; + float scale = cvm_sigmoid_scale(range_start, range_end); + + // fill tg value + gen_input(ifmap, ifmap_size); + cvm_sigmoid_tbl(table_data, table_data_slope, &table_shape, range_start, range_end); + tl_lut_ref(ref_data, ifmap, table_data, table_data_slope, ifmap_shape, table_shape, range_start, + range_end); + + // alloc tl + cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *cvk_tl_table_answer = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *cvk_tl_table_answer_slope = 
test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + + // sys->local + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)ifmap); + test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer, (uint8_t *)table_data); + test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer_slope, (uint8_t *)table_data_slope); + + // emit core function + cvm_emit_sigmoid(bmk, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_slope, + tl_ofmap_bf16, scale); + + uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, tl_ofmap_bf16); + + verify(ofmap_data, ref_data, ofmap_size); + + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_buf); + free_tl(bmk, cvk_tl_table_answer_slope); + free_tl(bmk, cvk_tl_table_answer); + free_tl(bmk, tl_ifmap); + + free(ifmap); + free(table_data); + free(table_data_slope); + free(ref_data); + free(ofmap_data); +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bmk; + int round_mode; + + round_mode = set_store_feround(); + + test_init(&ctx, &bmk); + + for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++) { + // for (int i = GEN_DATA_MAX_ERROR; i < TEST_MODE_MAX; i++) { + // for (int i = PRE_DATA_MAX_ERROR; i < GEN_DATA_MAX_ERROR; i++) { + mode = static_cast(i); + printf("test mode %d...\n", mode); + testbench(&ctx, bmk); + } + + test_exit(&ctx, bmk); + restore_feround(round_mode); + return 0; +} diff --git a/cvimath/tests/cvi1835/sqrt.cpp b/cvimath/tests/cvi1835/sqrt.cpp new file mode 100644 index 000000000..60bbacb2d --- /dev/null +++ b/cvimath/tests/cvi1835/sqrt.cpp @@ -0,0 +1,375 @@ +/** + */ +#include +#include + +#include +#include +#include +#include +#include +#include +//#define DBG + +using namespace std; + +/** + * pre_data means we test fixed pattern, it should be same sa lut + */ +enum TEST_MODE { + PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare + 
GEN_POW_20_DATA_MAX_ERROR, // generate 2^-20 ~ 2^20 value that check epsilon + TEST_MODE_MAX, +}; + +static TEST_MODE mode; + +static uint16_t test_pattern[] = { + 0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52, 0x3A6C, 0x3A83, 0x3A90, + 0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF, 0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17, + 0x3B1D, 0x3B24, 0x3B2A, 0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65, + 0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90, 0x3B93, 0x3B97, 0x3B9A, + 0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE, 0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1, + 0x3BC5, 0x3BC8, 0x3BCB, 0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9, + 0x3BEC, 0x3BEF, 0x3BF2, 0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03, 0x3C05, 0x3C06, 0x3C08, + 0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12, 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C, + 0x3C1D, 0x3C1F, 0x3C21, 0x3C22, 0x3C24, 0x3C25, 0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F, + 0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E, 0x3C40, 0x3C41, 0x3C43, + 0x3C45, 0x3C46, 0x3C48, 0x3C4A, 0x3C4B, 0x3C4D, 0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57, + 0x3C58, 0x3C5A, 0x3C5C, 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A, + 0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, 0x3C7B, 0x3C7C, 0x3C7E, + 0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89, + 0x3C8A, 0x3C8A, 0x3C8B, 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93, + 0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, 0x3C9B, 0x3C9C, 0x3C9C, + 0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6, + 0x3CA7, 0x3CA8, 0x3CA9, 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0, + 0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, 
0x3CB8, 0x3CB9, 0x3CBA, + 0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4, + 0x3CC5, 0x3CC5, 0x3CC6, 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE, + 0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, 0x3CD6, 0x3CD7, 0x3CD7, + 0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1, + 0x3CE2, 0x3CE3, 0x3CE4, 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB, + 0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, 0x3CF3, 0x3CF4, 0x3CF5, + 0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF, + 0x3D00, 0x3D00, 0x3D01, 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 0x3D03, 0x3D03, 0x3D04, 0x3D04, + 0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, 0x3D08, 0x3D09, 0x3D09, + 0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 0x3D0E, + 0x3D0F, 0x3D0F, 0x3D0F, 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13, + 0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 0x3D16, 0x3D17, 0x3D17, 0x3D18, 0x3D18, + 0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D, + 0x3D1D, 0x3D1E, 0x3D1E, 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22, + 0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, 0x3D26, 0x3D26, 0x3D27, + 0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C, + 0x3D2C, 0x3D2C, 0x3D2D, 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31, + 0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 0x3D33, 0x3D33, 0x3D34, 0x3D34, 0x3D35, 0x3D35, 0x3D35, + 0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A, + 0x3D3B, 0x3D3B, 0x3D3C, 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F, + 0x3D40, 
0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, 0x3D43, 0x3D44, 0x3D44, + 0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49, + 0x3D4A, 0x3D4A, 0x3D4A, 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E, + 0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, 0x3D52, 0x3D53, 0x3D53, + 0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58, + 0x3D58, 0x3D59, 0x3D59, 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D, + 0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, 0x3D61, 0x3D61, 0x3D62, + 0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67, + 0x3D67, 0x3D67, 0x3D68, 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C, + 0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, 0x3D70, 0x3D70, 0x3D70, + 0x3D71, 0x3D71, 0x3D72, 0x3D72, 0x3D72, 0x3D73, 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75, + 0x3D76, 0x3D76, 0x3D77, 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A, + 0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, 0x3D7E, 0x3D7F, 0x3D7F, + 0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82, + 0x3D82, 0x3D82, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85, + 0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D87, 0x3D87, 0x3D87, + 0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89, + 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8C, 0x3D8C, + 0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, + 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91, + 0x3D91, 0x3D91, 0x3D91, 0x3D92, 0x3D92, 0x3D92, 
0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93, + 0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, 0x3D95, 0x3D96, 0x3D96, + 0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98, + 0x3D98, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B, + 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9D, 0x3D9D, 0x3D9D, + 0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0, + 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2, + 0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, + 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7, + 0x3DA7, 0x3DA7, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, + 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAC, 0x3DAC, + 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, + 0x3DAE, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1, + 0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3, + 0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6, + 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8, + 0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBB, + 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD, + 0x3DBD, 0x3DBD, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, + 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC2, 0x3DC2, 0x3DC2, + 0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 
0x3DC4, + 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7, + 0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, 0x3DC9, 0x3DC9, 0x3DC9, + 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC, + 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5, + 0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, 0x3DDF, 0x3DE0, 0x3DE1, + 0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5, +}; + +static uint16_t test_pattern_ref[] = { + 0x0, 0x3c24, 0x3c68, 0x3c8e, 0x3ca4, 0x3cb7, 0x3cc8, 0x3cd9, 0x3ce8, 0x3cf6, 0x3d01, 0x3d08, + 0x3d0e, 0x3d14, 0x3d19, 0x3d1f, 0x3d24, 0x3d29, 0x3d2e, 0x3d33, 0x3d37, 0x3d3c, 0x3d40, 0x3d45, + 0x3d48, 0x3d4d, 0x3d51, 0x3d55, 0x3d59, 0x3d5d, 0x3d61, 0x3d64, 0x3d68, 0x3d6b, 0x3d6f, 0x3d72, + 0x3d76, 0x3d79, 0x3d7c, 0x3d80, 0x3d81, 0x3d83, 0x3d85, 0x3d86, 0x3d88, 0x3d89, 0x3d8b, 0x3d8c, + 0x3d8e, 0x3d90, 0x3d91, 0x3d92, 0x3d94, 0x3d95, 0x3d97, 0x3d98, 0x3d99, 0x3d9b, 0x3d9c, 0x3d9d, + 0x3d9f, 0x3da0, 0x3da1, 0x3da2, 0x3da4, 0x3da5, 0x3da6, 0x3da8, 0x3da9, 0x3daa, 0x3dab, 0x3dad, + 0x3dae, 0x3daf, 0x3db0, 0x3db1, 0x3db3, 0x3db4, 0x3db5, 0x3db6, 0x3db7, 0x3db9, 0x3db9, 0x3dbb, + 0x3dbc, 0x3dbd, 0x3dbe, 0x3dbf, 0x3dc0, 0x3dc1, 0x3dc2, 0x3dc3, 0x3dc5, 0x3dc5, 0x3dc7, 0x3dc8, + 0x3dc8, 0x3dca, 0x3dcb, 0x3dcc, 0x3dcd, 0x3dce, 0x3dcf, 0x3dd0, 0x3dd1, 0x3dd2, 0x3dd3, 0x3dd4, + 0x3dd5, 0x3dd6, 0x3dd7, 0x3dd8, 0x3dd9, 0x3dda, 0x3ddb, 0x3ddb, 0x3ddd, 0x3dde, 0x3dde, 0x3ddf, + 0x3de1, 0x3de1, 0x3de2, 0x3de3, 0x3de4, 0x3de5, 0x3de6, 0x3de7, 0x3de8, 0x3de8, 0x3dea, 0x3deb, + 0x3deb, 0x3dec, 0x3ded, 0x3dee, 0x3def, 0x3def, 0x3df1, 0x3df2, 0x3df2, 0x3df3, 0x3df4, 0x3df5, + 0x3df6, 0x3df7, 0x3df7, 0x3df8, 0x3df9, 0x3dfa, 0x3dfb, 0x3dfb, 0x3dfc, 0x3dfd, 0x3dfe, 0x3dff, + 0x3e00, 0x3e00, 0x3e00, 0x3e01, 0x3e01, 0x3e02, 0x3e02, 0x3e03, 0x3e03, 0x3e03, 0x3e04, 0x3e04, + 0x3e05, 0x3e05, 0x3e05, 0x3e06, 0x3e06, 0x3e07, 
0x3e07, 0x3e07, 0x3e08, 0x3e08, 0x3e09, 0x3e09, + 0x3e09, 0x3e0a, 0x3e0a, 0x3e0b, 0x3e0b, 0x3e0b, 0x3e0b, 0x3e0c, 0x3e0c, 0x3e0d, 0x3e0d, 0x3e0d, + 0x3e0e, 0x3e0e, 0x3e0f, 0x3e0f, 0x3e10, 0x3e10, 0x3e10, 0x3e10, 0x3e11, 0x3e11, 0x3e11, 0x3e12, + 0x3e12, 0x3e13, 0x3e13, 0x3e14, 0x3e14, 0x3e14, 0x3e14, 0x3e15, 0x3e15, 0x3e15, 0x3e16, 0x3e16, + 0x3e17, 0x3e17, 0x3e17, 0x3e17, 0x3e18, 0x3e18, 0x3e19, 0x3e19, 0x3e19, 0x3e19, 0x3e1a, 0x3e1a, + 0x3e1b, 0x3e1b, 0x3e1b, 0x3e1c, 0x3e1c, 0x3e1c, 0x3e1d, 0x3e1d, 0x3e1d, 0x3e1e, 0x3e1e, 0x3e1e, + 0x3e1f, 0x3e1f, 0x3e1f, 0x3e20, 0x3e20, 0x3e20, 0x3e21, 0x3e21, 0x3e21, 0x3e22, 0x3e22, 0x3e22, + 0x3e22, 0x3e23, 0x3e23, 0x3e24, 0x3e24, 0x3e24, 0x3e24, 0x3e25, 0x3e25, 0x3e26, 0x3e26, 0x3e26, + 0x3e26, 0x3e27, 0x3e27, 0x3e27, 0x3e28, 0x3e28, 0x3e28, 0x3e29, 0x3e29, 0x3e29, 0x3e29, 0x3e2a, + 0x3e2a, 0x3e2a, 0x3e2b, 0x3e2b, 0x3e2b, 0x3e2c, 0x3e2c, 0x3e2c, 0x3e2d, 0x3e2d, 0x3e2d, 0x3e2d, + 0x3e2e, 0x3e2e, 0x3e2f, 0x3e2f, 0x3e2f, 0x3e2f, 0x3e30, 0x3e30, 0x3e30, 0x3e30, 0x3e31, 0x3e31, + 0x3e31, 0x3e32, 0x3e32, 0x3e32, 0x3e33, 0x3e33, 0x3e33, 0x3e33, 0x3e34, 0x3e34, 0x3e34, 0x3e35, + 0x3e35, 0x3e35, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e37, 0x3e37, 0x3e37, 0x3e38, 0x3e38, + 0x3e39, 0x3e39, 0x3e39, 0x3e39, 0x3e39, 0x3e3a, 0x3e3a, 0x3e3b, 0x3e3b, 0x3e3b, 0x3e3b, 0x3e3b, + 0x3e3c, 0x3e3c, 0x3e3c, 0x3e3d, 0x3e3d, 0x3e3d, 0x3e3d, 0x3e3d, 0x3e3e, 0x3e3e, 0x3e3f, 0x3e3f, + 0x3e3f, 0x3e3f, 0x3e3f, 0x3e40, 0x3e40, 0x3e41, 0x3e41, 0x3e41, 0x3e41, 0x3e41, 0x3e42, 0x3e42, + 0x3e42, 0x3e43, 0x3e43, 0x3e43, 0x3e43, 0x3e44, 0x3e44, 0x3e44, 0x3e45, 0x3e45, 0x3e45, 0x3e45, + 0x3e45, 0x3e46, 0x3e46, 0x3e47, 0x3e47, 0x3e47, 0x3e47, 0x3e47, 0x3e48, 0x3e48, 0x3e48, 0x3e48, + 0x3e48, 0x3e49, 0x3e49, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4b, 0x3e4b, 0x3e4b, 0x3e4c, + 0x3e4c, 0x3e4c, 0x3e4c, 0x3e4c, 0x3e4d, 0x3e4d, 0x3e4e, 0x3e4e, 0x3e4e, 0x3e4e, 0x3e4e, 0x3e4f, + 0x3e4f, 0x3e4f, 0x3e4f, 0x3e4f, 0x3e50, 0x3e50, 0x3e51, 0x3e51, 0x3e51, 0x3e51, 0x3e51, 
0x3e52, + 0x3e52, 0x3e52, 0x3e52, 0x3e52, 0x3e53, 0x3e53, 0x3e53, 0x3e54, 0x3e54, 0x3e54, 0x3e54, 0x3e55, + 0x3e55, 0x3e55, 0x3e55, 0x3e55, 0x3e56, 0x3e56, 0x3e56, 0x3e57, 0x3e57, 0x3e57, 0x3e57, 0x3e57, + 0x3e58, 0x3e58, 0x3e58, 0x3e58, 0x3e59, 0x3e59, 0x3e59, 0x3e5a, 0x3e5a, 0x3e5a, 0x3e5a, 0x3e5a, + 0x3e5b, 0x3e5b, 0x3e5b, 0x3e5b, 0x3e5b, 0x3e5c, 0x3e5c, 0x3e5d, 0x3e5d, 0x3e5d, 0x3e5d, 0x3e5d, + 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5f, 0x3e5f, 0x3e5f, 0x3e5f, 0x3e5f, 0x3e60, 0x3e60, + 0x3e61, 0x3e61, 0x3e61, 0x3e61, 0x3e61, 0x3e62, 0x3e62, 0x3e62, 0x3e62, 0x3e62, 0x3e63, 0x3e63, + 0x3e63, 0x3e63, 0x3e63, 0x3e64, 0x3e64, 0x3e65, 0x3e65, 0x3e65, 0x3e65, 0x3e65, 0x3e66, 0x3e66, + 0x3e66, 0x3e66, 0x3e66, 0x3e67, 0x3e67, 0x3e67, 0x3e67, 0x3e67, 0x3e68, 0x3e68, 0x3e68, 0x3e68, + 0x3e68, 0x3e69, 0x3e69, 0x3e6a, 0x3e6a, 0x3e6a, 0x3e6a, 0x3e6a, 0x3e6b, 0x3e6b, 0x3e6b, 0x3e6b, + 0x3e6b, 0x3e6c, 0x3e6c, 0x3e6c, 0x3e6c, 0x3e6c, 0x3e6d, 0x3e6d, 0x3e6d, 0x3e6d, 0x3e6d, 0x3e6e, + 0x3e6e, 0x3e6e, 0x3e6e, 0x3e6e, 0x3e6f, 0x3e6f, 0x3e6f, 0x3e6f, 0x3e6f, 0x3e70, 0x3e70, 0x3e71, + 0x3e71, 0x3e71, 0x3e71, 0x3e71, 0x3e72, 0x3e72, 0x3e72, 0x3e72, 0x3e72, 0x3e73, 0x3e73, 0x3e73, + 0x3e73, 0x3e73, 0x3e74, 0x3e74, 0x3e74, 0x3e74, 0x3e74, 0x3e75, 0x3e75, 0x3e75, 0x3e75, 0x3e76, + 0x3e76, 0x3e76, 0x3e76, 0x3e76, 0x3e77, 0x3e77, 0x3e77, 0x3e77, 0x3e77, 0x3e78, 0x3e78, 0x3e78, + 0x3e78, 0x3e78, 0x3e79, 0x3e79, 0x3e79, 0x3e79, 0x3e79, 0x3e7a, 0x3e7a, 0x3e7a, 0x3e7a, 0x3e7a, + 0x3e7b, 0x3e7b, 0x3e7b, 0x3e7b, 0x3e7b, 0x3e7c, 0x3e7c, 0x3e7c, 0x3e7c, 0x3e7c, 0x3e7d, 0x3e7d, + 0x3e7d, 0x3e7d, 0x3e7d, 0x3e7e, 0x3e7e, 0x3e7e, 0x3e7e, 0x3e7f, 0x3e7f, 0x3e7f, 0x3e7f, 0x3e7f, + 0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e81, 0x3e81, 0x3e81, + 0x3e81, 0x3e81, 0x3e81, 0x3e81, 0x3e81, 0x3e81, 0x3e81, 0x3e82, 0x3e82, 0x3e82, 0x3e82, 0x3e82, + 0x3e82, 0x3e82, 0x3e82, 0x3e82, 0x3e83, 0x3e83, 0x3e83, 0x3e83, 0x3e83, 0x3e83, 0x3e83, 0x3e83, + 0x3e83, 0x3e83, 0x3e84, 
0x3e84, 0x3e84, 0x3e84, 0x3e84, 0x3e84, 0x3e84, 0x3e84, 0x3e84, 0x3e84, + 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e86, 0x3e86, + 0x3e86, 0x3e86, 0x3e86, 0x3e86, 0x3e86, 0x3e86, 0x3e86, 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e87, + 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e88, 0x3e88, 0x3e88, 0x3e88, 0x3e88, 0x3e88, 0x3e88, + 0x3e88, 0x3e88, 0x3e88, 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89, + 0x3e89, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8b, 0x3e8b, + 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, + 0x3e8b, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8d, + 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8e, 0x3e8e, 0x3e8e, + 0x3e8e, 0x3e8e, 0x3e8e, 0x3e8e, 0x3e8e, 0x3e8e, 0x3e8f, 0x3e8f, 0x3e8f, 0x3e8f, 0x3e8f, 0x3e8f, + 0x3e8f, 0x3e8f, 0x3e8f, 0x3e8f, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, + 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e91, 0x3e91, 0x3e91, 0x3e91, 0x3e91, + 0x3e91, 0x3e91, 0x3e91, 0x3e91, 0x3e91, 0x3e92, 0x3e92, 0x3e92, 0x3e92, 0x3e92, 0x3e92, 0x3e92, + 0x3e92, 0x3e92, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, + 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, + 0x3e94, 0x3e94, 0x3e94, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, + 0x3e95, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e97, 0x3e97, + 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, + 0x3e97, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e99, + 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, + 0x3e99, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 
0x3e9a, 0x3e9a, 0x3e9a, 0x3e9b, + 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9c, 0x3e9c, 0x3e9c, + 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, + 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9e, 0x3e9e, 0x3e9e, + 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, + 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3ea0, 0x3ea0, + 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, + 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea2, 0x3ea2, + 0x3ea2, 0x3ea2, 0x3ea2, 0x3ea2, 0x3ea2, 0x3ea3, 0x3ea3, 0x3ea4, 0x3ea4, 0x3ea4, 0x3ea5, 0x3ea5, + 0x3ea6, 0x3ea6, 0x3ea6, 0x3ea7, 0x3ea7, 0x3ea7, 0x3ea8, 0x3ea8, 0x3ea9, 0x3ea9, 0x3ea9, 0x3eaa, + 0x3eaa, 0x3eaa, 0x3eab, 0x3eab, +}; + +static void tl_lut_ref(uint16_t *ofmap, uint16_t *ifmap, cvk_tl_shape_t ifmap_shape) { + for (uint32_t i = 0; i < tl_shape_size(&ifmap_shape); i++) { + if (mode == PRE_DATA_COMPARE_FIX) { + ofmap[i] = test_pattern_ref[i]; + } else { + ofmap[i] = convert_fp32_bf16(pow(convert_bf16_fp32(ifmap[i]), 0.5)); + } + } +} + +static bool verify(uint16_t *ofmap_data, uint16_t *ref_data, uint16_t *ifmap, + uint64_t ifmap_shape_size, TEST_MODE mode) { + uint64_t size = ifmap_shape_size; + + for (uint64_t i = 0; i < size; i++) { + bool is_close; + uint16_t ref; + uint16_t ofmap_data_bf16; + float ref_f; + float ofmap_data_f; + + ref = ref_data[i]; + ref_f = convert_bf16_fp32(ref); + ofmap_data_f = convert_bf16_fp32(ofmap_data[i]); + ofmap_data_bf16 = ofmap_data[i]; + + if (mode == PRE_DATA_COMPARE_FIX) { + is_close = ofmap_data[i] == ref; + } else { + is_close = fabs(ref_f - ofmap_data_f) < 0.001; + } + + if (!is_close) { + fprintf(stderr, + "comparing failed at ofmap_data[%lu](input:%e), got %x, exp %x, " + "fp32: got %e exp %e\n", + i, 
convert_bf16_fp32(ifmap[i]), ofmap_data_bf16, ref, ofmap_data_f, ref_f); + exit(-1); + } + } + + return true; +} + +static void gen_input(uint16_t *ifmap, uint64_t ifmap_shape_size) { + if (mode == PRE_DATA_COMPARE_FIX) { + memcpy(ifmap, &test_pattern, sizeof(test_pattern)); + } else { + for (uint64_t i = 0; i < ifmap_shape_size; i++) { + srand(static_cast(time(0))); + std::random_device rd; + std::mt19937 e2(rd()); + float LO = pow(2, -10); + float HI = pow(2, 10); + // std::uniform_real_distribution<> dist(pow(2,-62), pow(2,63)); + for (uint64_t i = 0; i < ifmap_shape_size; i++) { + // float r3 = dist(e2); + float r3 = LO + static_cast(rand()) / (static_cast(RAND_MAX / (HI - LO))); + ifmap[i] = convert_fp32_bf16(r3); + } + } + } + +#ifdef DBG + for (uint64_t i = 0; i < ifmap_shape_size; i++) { + printf("source if[%lu] bf16 %f 0x%x, log2f is %f\n", i, convert_bf16_fp32(ifmap[i]), ifmap[i], + floor(log2((convert_bf16_fp32(ifmap[i]))))); + } +#endif /* ifdef DBG */ +} + +static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk, uint32_t input_n, uint32_t input_c, + uint32_t input_h, uint32_t input_w) { + cvk_fmt_t fmt = CVK_FMT_BF16; + + // TODO: check more shape / align + cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + cvk_tl_shape_t ofmap_shape = ifmap_shape; + cvk_tl_shape_t table_shape; + cvm_table_shape(bmk, &table_shape); + + uint64_t ifmap_shape_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + uint64_t table_size = tl_shape_size(&table_shape); + + // prepare input data with size + int data_type_size = bytesize_of_fmt(fmt); + uint64_t ifmap_bytesize = ifmap_shape_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + uint64_t table_bytesize = table_size * data_type_size; + + uint16_t *ifmap = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize); + uint16_t *table_data = (uint16_t *)xmalloc(table_bytesize); + uint16_t 
*table_data_mantissa = (uint16_t *)xmalloc(table_bytesize); + + // alloc lmem + cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf = test_alloc_tl(bmk, tl_ifmap->shape, fmt, /*align*/ 1); + cvk_tl_t *cvk_tl_table_answer = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *cvk_tl_table_answer_mantissa = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + + // generate testbench + gen_input(ifmap, ifmap_shape_size); + tl_lut_ref(ref_data, ifmap, ifmap_shape); + + // prepare table + cvm_sqrt_tbl(table_data, table_data_mantissa, &table_shape); + + // sys->lmem + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)ifmap); + test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer, (uint8_t *)table_data); + test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer_mantissa, (uint8_t *)table_data_mantissa); + + cvm_emit_sqrt(bmk, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_mantissa, + tl_ofmap_bf16); + + // issue cmd + test_submit_comp(ctx, bmk); + + // get output from lmem->sys + uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, tl_ofmap_bf16); + + verify(ofmap_data, ref_data, ifmap, ifmap_shape_size, mode); + + free_tl(bmk, cvk_tl_table_answer_mantissa); + free_tl(bmk, cvk_tl_table_answer); + free_tl(bmk, tl_buf); + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_ifmap); + + free(ifmap); + free(ref_data); + free(ofmap_data); + free(table_data); + free(table_data_mantissa); +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bmk; + int round_mode; + + round_mode = set_store_feround(); + + test_init(&ctx, &bmk); + + for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++) { + mode = static_cast(i); + printf("test mode %d...\n", mode); + + int input_n = 1; + int input_c = 32; + int input_h = 1; + int input_w = 1; + + if (mode == PRE_DATA_COMPARE_FIX) { + input_h = 4; + input_w = 8; + } else { + input_h = 
input_w = 16; + } + + testbench(&ctx, bmk, input_n, input_c, input_h, input_w); + } + + test_exit(&ctx, bmk); + restore_feround(round_mode); + return 0; +} diff --git a/cvimath/tests/include/test_native_ref.h b/cvimath/tests/include/test_native_ref.h new file mode 100644 index 000000000..34ec5ecd9 --- /dev/null +++ b/cvimath/tests/include/test_native_ref.h @@ -0,0 +1,383 @@ +#ifndef _BM_NATIVE_REF_H_ +#define _BM_NATIVE_REF_H_ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef union { + uint32_t ival; + float fval; +} IF_VAL; + +/* + * fp32 version + */ + +int array_cmp_float(const char *const info, float *p_exp, float *p_got, int count, float delta); +int array_cmp_int(const char *const info, int *p_exp, int *p_got, int count); + +/** + * @name calc_dilute_hw + * @brief calculate diluted dimention + * @ingroup libbmutils + * + * @param [in] h origin dimention + * @param [in] ins_h scaleing factor, 0 -> no scaling + * @param [in] ins_h_l compensation value after last value in each row + * @param [in] pad_h_b extra padding left ofr bottom + * @param [in] pad_h_t extra padding right or top + * + * @retval diluted value + */ +int calc_dilute_hw(int h, int ins_h, int ins_h_l, int pad_h_b, int pad_h_t); + +/** + * @name calc_output_hw + * @brief calculate output dimention by kernel and stride size + * @ingroup libbmutils + * + * @param [in] hw origin dimention + * @param [in] kwh scaling factor, 0 -> no scaling + * @param [in] stride compensation value after last value in each row + * + * @retval output dimention + */ +int calc_output_hw(int hw, int khw, int stride); + +/** + * @name fill_pad_fmap_fp32 + * @brief fill padded feature map with unpadded map + * @ingroup libbmutils + * + * @param [in] before input array + * @param [out] pbefore output array reference, if NULL, alloc a new one + * @param [in] pad_val padding value + * @param [in] pad_l padding left size + * @param [in] pad_r padding right size + * @param [in] pad_t padding 
top size + * @param [in] pad_b padding bottom size + * @param [in] ins_h scaling factor h + * @param [in] ins_w scaling factor w + * @param [in] ins_h_last compensation value after last value in each row + * @param [in] ins_w_last compensation value after last value in each col + * @param [in] h_before origin height + * @param [in] w_before origin width + * + * @retval BM_SUCCESS success + * @retval BM_ERR_INVALID_ARGUMENT before or pafter is null pointer + * @retval BM_ERR_NOMEM can't alloc new output array + */ +int fill_pad_fmap_fp32(const float *before, float **after, float pad_value, int pad_t, int pad_b, + int pad_l, int pad_r, int ins_h, int ins_w, int ins_h_last, int ins_w_last, + int h_before, int w_before); + +void native_md_scalar(float *a, float *b, float *r, int N, int C, int H, int W, int op, + bool result_add); + +void native_conv_ref(const void *ifmap, void *ofmap, const void *weight, int input_n, int input_c, + int input_h, int input_w, int output_c, int output_h, int output_w, int groups, + int kh, int kw, int dilation_h, int dilation_w, int pad_h, int pad_w, + int stride_h, int stride_w, int flip, int using_bias, const void *bias, + int result_add); + +void native_pooling_forward_max(const float *bottom_data, float *top_data, int *mask_data, + const int count, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, + const int pad_w); + +void native_pooling_forward_ave(const float *bottom_data, float *top_data, const int count, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w); + +/* + * int8 vresion + */ + +/** + * @name array_cmp_int8 + * @brief compare the contect of p_exp and p_got and print 
the error index + * and value + * @ingroup libbmutils + * + * @param [in] info information string printed when encounter error + * @param [in] p_exp expected value array + * @param [in] p_got actual value array to compare + * @param [in] count length of input arrays + * @retval 0 no error + * @retval -1 error occur + */ +int array_cmp_int8(const char *const info, const int8_t *p_exp, const int8_t *p_got, int count); + +/** + * @name fill_pad_fmap_int8 + * @brief fill padded feature map with unpadded map + * @ingroup libbmutils + * + * @param [in] before input array + * @param [out] pafter output array reference, if NULL, alloc a new one + * @param [in] pad_val padding value + * @param [in] pad_l padding left size + * @param [in] pad_r padding right size + * @param [in] pad_t padding top size + * @param [in] pad_b padding bottom size + * @param [in] ins_h scaling factor h + * @param [in] ins_w scaling factor w + * @param [in] ins_h_last compensation value after last value in each row + * @param [in] ins_w_last compensation value after last value in each col + * @param [in] h_before origin height + * @param [in] w_before origin width + * + * @retval BM_SUCCESS success + * @retval BM_ERR_INVALID_ARGUMENT before or pafter is null pointer + * @retval BM_ERR_NOMEM can't alloc new output array + */ +int fill_pad_fmap_int8(const int8_t *before, int8_t **pafter, int pad_val, int pad_l, int pad_r, + int pad_t, int pad_b, int ins_h, int ins_w, int ins_h_last, int ins_w_last, + int h_before, int w_before); + +int fill_pad_fmap_bf16(const unsigned short *before, unsigned short **pafter, int pad_val, + int pad_l, int pad_r, int pad_t, int pad_b, int ins_h, int ins_w, + int ins_h_last, int ins_w_last, int h_before, int w_before); + +/** + * @name fill_int_with_int8 + * @brief (int) pdest[i] = (int8_t)psrc[i] for each element + * @ingroup libbmutils + * + * @param [out] pdest output array + * @param [in] psrc input array + * @param [in] len length of input array + */ +void fill_int_with_int8(int
*pdest, int8_t *psrc, int len); + +/** + * @name fill_int_with_uint8 + * @brief (int) pdest[i] = (uint8_t)psrc[i] for each element + * @ingroup libbmutils + * + * @param [out] pdest output array + * @param [in] psrc input array + * @param [in] len length of input array + */ +void fill_int_with_uint8(int *pdest, uint8_t *psrc, int len); + +/** + * @name fill_int_with_int16 + * @brief (int) pdest[i] = (int16_t)psrc[i] for each element + * @ingroup libbmutils + * + * @param [out] pdest output array + * @param [in] psrc input array + * @param [in] len length of input array + */ +void fill_int_with_int16(int *pdest, int16_t *psrc, int len); + +void native_md_scalar_int8(int8_t *a, int8_t *b, int8_t *r, int N, int C, int H, int W, int op, + bool result_add); + +/** + * @name inner_product + * @brief inner product of two array + * @ingroup libbmutils + * + * @param [in] a input array 0 + * @param [in] b input array 1 + * @param [in] len length of a or b + * @param [out] c store the summation + */ +void inner_product(const int *a, const int *b, int len, int *c); +void inner_float_product(const float *a, const float *b, int len, float *c); + +/** + * @name native_conv_int8 + * @brief do convolution specific 8bit feature map + * @ingroup libbmutils + * + * @param [in] ifmap input array + * @param [in] weight weight data array + * @param [in] bias bias array if !NULL, add bias + * @param [out] ofmap output array + * @param [in] in input batch size + * @param [in] ic input channel size + * @param [in] ih input height + * @param [in] iw input width + * @param [in] oc output channel size + * @param [in] kh kernel height + * @param [in] kw kernel width + * @param [in] dh kernel dilation height factor + * @param [in] dw kernel dilation width factor + * @param [in] pad_h_t padding top size + * @param [in] pad_h_b padding bottom size + * @param [in] pad_w_l padding left size + * @param [in] pad_w_r padding right size + * @param [in] stride_h stride height + * @param [in] 
stride_w stride width + * @param [in] ins_h insert extra element for each i_fmap row + * @param [in] ins_w insert extra element for each i_fmap col + * @param [in] ins_h_last insert extra element for last i_fmap row + * @param [in] ins_w_last insert extra element for last i_fmap col + * @param [in] input_sign i_fmap data type. 0 => signed, 1 => unsigned + * @param [in] r_shift_width scale bit for saturation + * + * @retval BM_SUCCESS success + * @retval other saturation failed + */ +int native_conv_int8(const int8_t *ifmap, const int8_t *weight, const int16_t *bias, int8_t *ofmap, + int in, int ic, int ih, int iw, int oc, int kh, int kw, int dh, int dw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, int stride_h, int stride_w, + int ins_h, int ins_w, int ins_h_last, int ins_w_last, int input_sign, + int r_shift_width, int do_relu); + +/** + * @name native_fc_int8 + * @brief do full-connected layer for specific feature map + * @ingroup libbmutils + * + * @param [in] L input array + * @param [in] R weight array + * @param [in] B bias array if !NULL, add bias + * @param [in] Y accumulation array if !NULL, add this + * @param [out] Y_ref output array + * @param [in] L_row_num input row size + * @param [in] L_col_num input col size + * @param [in] R_col_num weight col size + * @param [in] L_sign signedness of L + * @param [in] R_sign signedness of R + * @param [in] B_sign signedness of B + * @param [in] l_shift_width left shift width + * @param [in] r_shift_width right shift width + * @param [in] is_result_int8 whether to saturate result to int8 + * @param [in] do_relu whether to apply relu on result + * + * @retval BM_SUCCESS success + * @retval other saturation failed + */ +int native_fc_int8(const int8_t *L, const int8_t *R, const int16_t *B, const int16_t *Y, int *Y_ref, + int L_row_num, int L_col_num, int R_col_num, int L_sign, int R_sign, int B_sign, + int l_shift_width, int r_shift_width, int is_result_int8, int do_relu); + +/** + * @name native_pooling_ave_int8 + * @brief do average pooling for
specific feature map + * @ingroup libbmutils + * + * @param [in] i_fmap input array + * @param [in] weight weight data array + * @param [in] bias bias array if !NULL, add bias + * @param [out] o_fmap lenght of input array + * @param [in] pad_h_t padding top size + * @param [in] pad_h_b padding bottom size + * @param [in] pad_w_l padding left size + * @param [in] pad_w_r padding right size + * @param [in] stride_h stride height + * @param [in] stride_w stride width + * @param [in] ins_h insert extra element for each i_fmap row + * @param [in] ins_w insert extra element for each i_fmap col + * @param [in] ins_h_last insert extra element for last i_fmap row + * @param [in] ins_w_last insert extra element for last i_fmap col + * @param [in] input_sign i_fmap data type. 0 => signed, 1 => unsigned + * @param [in] satu_sign saturation data type. 0 => unsigned, 1 => signed + * @param [in] r_shift_width scale bit for saturation + * @param [in] const_weight if weight array has one uint8_t value + * + * @retval BM_SUCCESS success + * @retval BM_ERR_INVALID_ARGUMENT illegal kh/kw or r_shift_width + */ +int native_pooling_ave_int8(const int8_t *i_fmap, const void *weight, const int16_t *bias, + int8_t *o_fmap, int input_n, int input_c, int input_h, int input_w, + int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, int ins_w, int ins_h, int ins_w_last, + int ins_h_last, int input_sign, int satu_sign, int r_shift_width, + int const_weight); + +/** + * @name native_pooling_max_int8 + * @brief do max pooling for specific feature map + * @ingroup libbmutils + * + * @param [in] i_fmap input array + * @param [out] o_fmap lenght of input array + * @param [in] pad_h_t padding top size + * @param [in] pad_h_b padding bottom size + * @param [in] pad_w_l padding left size + * @param [in] pad_w_r padding right size + * @param [in] stride_h stride height + * @param [in] stride_w stride width + * @param [in] ins_h insert extra element for each 
i_fmap row + * @param [in] ins_w insert extra element for each i_fmap col + * @param [in] ins_h_last insert extra element for last i_fmap row + * @param [in] ins_w_last insert extra element for last i_fmap col + * @param [in] input_sign i_fmap data type. 0 => unsigned, 1 => signed + * + * @retval BM_SUCCESS success + * @retval BM_ERR_INVALID_ARGUMENT illegal ins_h/w or ins_[hw]_last + */ +int native_pooling_max_int8(const int8_t *i_fmap, int8_t *o_fmap, int input_n, int input_c, + int input_h, int input_w, int kh, int kw, int pad_h_t, int pad_h_b, + int pad_w_l, int pad_w_r, int stride_h, int stride_w, int ins_h, + int ins_w, int ins_h_last, int ins_w_last, int input_sign); + +int native_pooling_max_fp32(const float *i_fmap, float *o_fmap, int input_n, int input_c, + int input_h, int input_w, int kh, int kw, int pad_h_t, int pad_h_b, + int pad_w_l, int pad_w_r, int stride_h, int stride_w, int ins_h, + int ins_w, int ins_h_last, int ins_w_last); + +int native_pooling_avg_fp32(const float *i_fmap, float *o_fmap, int input_n, int input_c, + int input_h, int input_w, int kh, int kw, int pad_h_t, int pad_h_b, + int pad_w_l, int pad_w_r, int stride_h, int stride_w, int ins_h, + int ins_w, int ins_h_last, int ins_w_last, float avg_pooling_const); + +int native_depthwise_fp32(const float *ifmap, const float *weight, const float *bias, float *ofmap, + int in, int ic, int ih, int iw, int kh, int kw, int dh, int dw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, int stride_h, + int stride_w, int ins_h, int ins_w, int ins_h_last, int ins_w_last); + +/** + * @name satu_2_8bit + * @brief saturate each signed or unsiged 8bit element in array + * @ingroup libbmutils + * + * @param [in] pBuff input array + * @param [in] len lenght of input array + * @param [out] pyByteOut output array + * @param [in] rshiftbits right shift bit if round_floor && value != 0 + * @param [in] round_floor enable floor rounding + * @param [in] sign_unsign 0 => unsigned, 1 => signed + * + * @retval 
BM_SUCCESS success + * @retval BM_ERR_INVALID_ARGUMENT rshiftbits < 0 + */ +int satu_2_8bit(const int *pBuff, int len, int8_t *pByteOut, int rshiftbits, int round_floor, + int sign_unsign); + +/** + * @name satu_2_16bit + * @brief saturate each signed or unsigned 16bit element in array + * @ingroup libbmutils + * + * @param [in] pBuff input array + * @param [in] len length of input array + * @param [out] pByteOut output array + * @param [in] rshiftbits right shift bit if round_floor && value != 0 + * @param [in] round_floor enable floor rounding + * @param [in] sign_unsign 0 => unsigned, 1 => signed + * + * @retval BM_SUCCESS success + * @retval BM_ERR_INVALID_ARGUMENT rshiftbits < 0 + */ +int satu_2_16bit(const int *pBuff, int len, short *pByteOut, int rshiftbits, int round_floor, + int sign_unsign); +#ifdef __cplusplus +} +#endif + +#endif /* _BM_NATIVE_REF_H_ */ diff --git a/cvimath/tests/include/test_tf_quant_util.h b/cvimath/tests/include/test_tf_quant_util.h new file mode 100644 index 000000000..4ab497acf --- /dev/null +++ b/cvimath/tests/include/test_tf_quant_util.h @@ -0,0 +1,41 @@ +#ifndef TEST_TF_QUANT_UTIL_H +#define TEST_TF_QUANT_UTIL_H + +#include <stdint.h> + +#define MAX(a, b) \ + ({ \ + __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a > _b ? _a : _b; \ + }) + +#define MIN(a, b) \ + ({ \ + __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a > _b ? 
_b : _a; \ + }) + +#ifdef __cplusplus +extern "C" { +#endif + +int32_t RoundingDivideByPOT(int32_t x, int exponent); +int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b); +int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int rshift); +void QuantizeMultiplierSmallerThanOne(float real_multiplier, uint32_t *quantized_multiplier, + int *right_shift); + +void pack_chl_quan_param(uint32_t channels, int has_bias, int32_t *bias, uint32_t *multiplier, + int8_t *rshift, uint8_t *packed_data); + +// 1880v2: 5bit right shift, [0, 31] +// 1822: 1bit sign, 5b shift, [-32, 31] +int8_t truncate_rshift(int8_t rshift, int8_t allow_lshift); + +#ifdef __cplusplus +} +#endif + +#endif // TEST_TF_QUANT_UTIL_H diff --git a/cvimath/toolchain/toolchain-aarch64-linux.cmake b/cvimath/toolchain/toolchain-aarch64-linux.cmake new file mode 100644 index 000000000..f02735d44 --- /dev/null +++ b/cvimath/toolchain/toolchain-aarch64-linux.cmake @@ -0,0 +1,52 @@ +include(CMakeForceCompiler) + +# usage +# cmake -DCMAKE_TOOLCHAIN_FILE=../toolchain-arm-linux.cmake ../ +# The Generic system name is used for embedded targets (targets without OS) in +# CMake +set( CMAKE_SYSTEM_NAME Linux ) +set( CMAKE_SYSTEM_PROCESSOR aarch64 ) + +# Set a toolchain path. You only need to set this if the toolchain isn't in +# your system path. Don't forget a trailing path separator! +set(TOOLCHAIN_TOPDIR "${TOOLCHAIN_ROOT_DIR}") +set( TC_PATH "${TOOLCHAIN_ROOT_DIR}/bin/" ) + +# The toolchain prefix for all toolchain executables +set( CROSS_COMPILE aarch64-linux-gnu- ) +set( ARCH arm64 ) + +# specify the cross compiler. 
We force the compiler so that CMake doesn't +# attempt to build a simple test program as this will fail without us using +# the -nostartfiles option on the command line +set(CMAKE_C_COMPILER ${TC_PATH}${CROSS_COMPILE}gcc) +set(CMAKE_CXX_COMPILER ${TC_PATH}${CROSS_COMPILE}g++) + +# To build the tests, we need to set where the target environment containing +# the required library is. On Debian-like systems, this is +# /usr/aarch64-linux-gnu. +SET(CMAKE_FIND_ROOT_PATH $ENV{TOOLCHAIN_TOPDIR}) +# search for programs in the build host directories +SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +# for libraries and headers in the target directories +SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +# We must set the OBJCOPY setting into cache so that it's available to the +# whole project. Otherwise, this does not get set into the CACHE and therefore +# the build doesn't know what the OBJCOPY filepath is +set( CMAKE_OBJCOPY ${TC_PATH}${CROSS_COMPILE}objcopy + CACHE FILEPATH "The toolchain objcopy command " FORCE ) + +# Set the CMAKE C flags (which should also be used by the assembler!)
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Os -std=gnu11" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdata-sections" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-pointer-to-int-cast" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsigned-char" ) + +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsigned-char" ) + +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" ) +set( CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" ) diff --git a/cvimath/toolchain/toolchain-gnueabihf-linux.cmake b/cvimath/toolchain/toolchain-gnueabihf-linux.cmake new file mode 100644 index 000000000..5a606ebbd --- /dev/null +++ b/cvimath/toolchain/toolchain-gnueabihf-linux.cmake @@ -0,0 +1,57 @@ +include(CMakeForceCompiler) + +# usage +# cmake -DCMAKE_TOOLCHAIN_FILE=../toolchain-arm-linux.cmake ../ +# The Generic system name is used for embedded targets (targets without OS) in +# CMake +set( CMAKE_SYSTEM_NAME Linux ) +set( CMAKE_SYSTEM_PROCESSOR arm ) + +# Set a toolchain path. You only need to set this if the toolchain isn't in +# your system path. Don't forget a trailing path separator! +set(TOOLCHAIN_TOPDIR "${TOOLCHAIN_ROOT_DIR}") +set( TC_PATH "${TOOLCHAIN_ROOT_DIR}/bin/" ) + +# The toolchain prefix for all toolchain executables +set( CROSS_COMPILE arm-linux-gnueabihf- ) +set( ARCH arm ) + +# specify the cross compiler. We force the compiler so that CMake doesn't +# attempt to build a simple test program as this will fail without us using +# the -nostartfiles option on the command line +set(CMAKE_C_COMPILER ${TC_PATH}${CROSS_COMPILE}gcc) +set(CMAKE_CXX_COMPILER ${TC_PATH}${CROSS_COMPILE}g++) + +# To build the tests, we need to set where the target environment containing +# the required library is. On Debian-like systems, this is +# /usr/arm-linux-gnueabihf. 
+SET(CMAKE_FIND_ROOT_PATH $ENV{TOOLCHAIN_TOPDIR}) +# search for programs in the build host directories +SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +# for libraries and headers in the target directories +SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +# We must set the OBJCOPY setting into cache so that it's available to the +# whole project. Otherwise, this does not get set into the CACHE and therefore +# the build doesn't know what the OBJCOPY filepath is +set( CMAKE_OBJCOPY ${TC_PATH}${CROSS_COMPILE}objcopy + CACHE FILEPATH "The toolchain objcopy command " FORCE ) + +# Set the CMAKE C flags (which should also be used by the assembler!) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Os -std=gnu11" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdata-sections" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-pointer-to-int-cast" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsigned-char" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfloat-abi=hard" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon-vfpv4" ) + +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsigned-char" ) +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a" ) +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=hard" ) +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon-vfpv4" ) + +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" ) +set( CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" )