From 83dc4914fef1fdae9f7d49e95fc0a4d8765eadcc Mon Sep 17 00:00:00 2001 From: carbon Date: Fri, 31 May 2024 11:54:07 +0800 Subject: [PATCH] add cvimath commit ce8705f49da5e5f59c2ddb3253ef88323a0cd9c4 Author: sophgo-forum-service Date: Mon May 13 14:04:10 2024 +0800 [feat] cvimath opensource for cv18xx soc. - 9e8967 --- .version/2024-05-31.md | 1 + cvimath/.clang-format | 108 ++ cvimath/.gitignore | 5 + cvimath/CMakeLists.txt | 85 + cvimath/README.md | 21 + cvimath/clang-format.sh | 8 + cvimath/include/cvimath.h | 84 + cvimath/include/cvimath_internal.h | 1066 +++++++++++++ cvimath/include/test_cvikernel_util.h | 393 +++++ cvimath/sample/CMakeLists.txt | 28 + cvimath/sample/README.md | 21 + cvimath/sample/sample_bf16_fp32.cpp | 130 ++ cvimath/sample/sample_fp32_bf16.cpp | 109 ++ cvimath/sample/sample_gemm.cpp | 312 ++++ cvimath/sample/sample_mask.cpp | 175 +++ cvimath/sample/sample_reduce_mul.cpp | 160 ++ cvimath/sample/sample_set_val_by_mask.cpp | 656 ++++++++ .../sample/sample_sigmoid_linear_interp.cpp | 165 ++ cvimath/sample/sample_upsample.cpp | 145 ++ cvimath/src/1880v2_fp_convert.c | 293 ++++ cvimath/src/CMakeLists.txt | 12 + cvimath/src/bf16_gemm.c | 1361 +++++++++++++++++ cvimath/src/blas_cpu.cpp | 82 + cvimath/src/chl_quan.cpp | 118 ++ cvimath/src/common.c | 1032 +++++++++++++ cvimath/src/fp32_bf16_kernel.c | 138 ++ cvimath/src/gen_lut.h | 207 +++ cvimath/src/set_val_by_mask.c | 1169 ++++++++++++++ cvimath/src/tiu_lut_atan.c | 1106 ++++++++++++++ cvimath/src/tiu_lut_atan2.c | 787 ++++++++++ cvimath/src/tiu_reciprocal.c | 149 ++ cvimath/src/tiu_reshape_c.c | 387 +++++ cvimath/src/tiu_sigmoid.c | 266 ++++ cvimath/src/tiu_sqrt.c | 121 ++ cvimath/src/tiu_upsample.c | 54 + cvimath/src/util.c | 270 ++++ cvimath/tests/CMakeLists.txt | 34 + cvimath/tests/common/test_native_ref.c | 980 ++++++++++++ cvimath/tests/cvi1835/atan.cpp | 477 ++++++ cvimath/tests/cvi1835/atan2_degree.cpp | 667 ++++++++ cvimath/tests/cvi1835/atan2_radian.cpp | 719 +++++++++ 
cvimath/tests/cvi1835/bf16_fp32.cpp | 148 ++ cvimath/tests/cvi1835/blas_cpu.cpp | 60 + cvimath/tests/cvi1835/blas_tpu.cpp | 134 ++ .../tests/cvi1835/depthwise_reshape_same.cpp | 907 +++++++++++ cvimath/tests/cvi1835/fp32_bf16.cpp | 127 ++ cvimath/tests/cvi1835/gemm.cpp | 845 ++++++++++ cvimath/tests/cvi1835/mask.cpp | 158 ++ cvimath/tests/cvi1835/reciprocal.cpp | 376 +++++ .../tests/cvi1835/sigmoid_linear_interp.cpp | 907 +++++++++++ cvimath/tests/cvi1835/sqrt.cpp | 375 +++++ cvimath/tests/include/test_native_ref.h | 383 +++++ cvimath/tests/include/test_tf_quant_util.h | 41 + .../toolchain/toolchain-aarch64-linux.cmake | 52 + .../toolchain/toolchain-gnueabihf-linux.cmake | 57 + 55 files changed, 18671 insertions(+) create mode 100644 cvimath/.clang-format create mode 100644 cvimath/.gitignore create mode 100644 cvimath/CMakeLists.txt create mode 100644 cvimath/README.md create mode 100755 cvimath/clang-format.sh create mode 100644 cvimath/include/cvimath.h create mode 100644 cvimath/include/cvimath_internal.h create mode 100644 cvimath/include/test_cvikernel_util.h create mode 100644 cvimath/sample/CMakeLists.txt create mode 100644 cvimath/sample/README.md create mode 100644 cvimath/sample/sample_bf16_fp32.cpp create mode 100644 cvimath/sample/sample_fp32_bf16.cpp create mode 100644 cvimath/sample/sample_gemm.cpp create mode 100644 cvimath/sample/sample_mask.cpp create mode 100644 cvimath/sample/sample_reduce_mul.cpp create mode 100644 cvimath/sample/sample_set_val_by_mask.cpp create mode 100644 cvimath/sample/sample_sigmoid_linear_interp.cpp create mode 100644 cvimath/sample/sample_upsample.cpp create mode 100644 cvimath/src/1880v2_fp_convert.c create mode 100644 cvimath/src/CMakeLists.txt create mode 100644 cvimath/src/bf16_gemm.c create mode 100644 cvimath/src/blas_cpu.cpp create mode 100644 cvimath/src/chl_quan.cpp create mode 100644 cvimath/src/common.c create mode 100644 cvimath/src/fp32_bf16_kernel.c create mode 100644 cvimath/src/gen_lut.h create mode 
100644 cvimath/src/set_val_by_mask.c create mode 100644 cvimath/src/tiu_lut_atan.c create mode 100644 cvimath/src/tiu_lut_atan2.c create mode 100644 cvimath/src/tiu_reciprocal.c create mode 100644 cvimath/src/tiu_reshape_c.c create mode 100644 cvimath/src/tiu_sigmoid.c create mode 100644 cvimath/src/tiu_sqrt.c create mode 100644 cvimath/src/tiu_upsample.c create mode 100644 cvimath/src/util.c create mode 100644 cvimath/tests/CMakeLists.txt create mode 100644 cvimath/tests/common/test_native_ref.c create mode 100644 cvimath/tests/cvi1835/atan.cpp create mode 100644 cvimath/tests/cvi1835/atan2_degree.cpp create mode 100644 cvimath/tests/cvi1835/atan2_radian.cpp create mode 100644 cvimath/tests/cvi1835/bf16_fp32.cpp create mode 100644 cvimath/tests/cvi1835/blas_cpu.cpp create mode 100644 cvimath/tests/cvi1835/blas_tpu.cpp create mode 100644 cvimath/tests/cvi1835/depthwise_reshape_same.cpp create mode 100644 cvimath/tests/cvi1835/fp32_bf16.cpp create mode 100644 cvimath/tests/cvi1835/gemm.cpp create mode 100644 cvimath/tests/cvi1835/mask.cpp create mode 100644 cvimath/tests/cvi1835/reciprocal.cpp create mode 100644 cvimath/tests/cvi1835/sigmoid_linear_interp.cpp create mode 100644 cvimath/tests/cvi1835/sqrt.cpp create mode 100644 cvimath/tests/include/test_native_ref.h create mode 100644 cvimath/tests/include/test_tf_quant_util.h create mode 100644 cvimath/toolchain/toolchain-aarch64-linux.cmake create mode 100644 cvimath/toolchain/toolchain-gnueabihf-linux.cmake diff --git a/.version/2024-05-31.md b/.version/2024-05-31.md index 80b66efde..ec79e0332 100644 --- a/.version/2024-05-31.md +++ b/.version/2024-05-31.md @@ -19,3 +19,4 @@ | cvibuilder | cvibuilder | https://github.com/sophgo/cvibuilder.git | sg200x-dev | 4309f2a | | cvikernel | cvikernel | https://github.com/sophgo/cvikernel.git | sg200x-dev | 9f1f57a | | cviruntime | cviruntime | https://github.com/sophgo/cviruntime.git | sg200x-dev | 3f49386 | +| cvimath | cvimath | https://github.com/sophgo/cvimath.git | 
sg200x-dev | ce8705f | diff --git a/cvimath/.clang-format b/cvimath/.clang-format new file mode 100644 index 000000000..b64279812 --- /dev/null +++ b/cvimath/.clang-format @@ -0,0 +1,108 @@ +--- +Language: Cpp +# BasedOnStyle: Google +AccessModifierOffset: -1 +AlignAfterOpenBracket: Align +AlignConsecutiveAssignments: false +AlignConsecutiveDeclarations: false +AlignEscapedNewlines: Left +AlignOperands: true +AlignTrailingComments: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AllowShortIfStatementsOnASingleLine: true +AllowShortLoopsOnASingleLine: true +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: true +AlwaysBreakTemplateDeclarations: true +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 100 +CommentPragmas: '^ IWYU pragma:' +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: true +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DerivePointerAlignment: true +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IncludeCategories: + - Regex: '^<.*\.h>' + Priority: 1 + - 
Regex: '^<.*' + Priority: 2 + - Regex: '.*' + Priority: 3 +IncludeIsMainRegex: '([-_](test|unittest))?$' +IndentCaseLabels: true +IndentWidth: 2 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: false +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: false +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 1 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 200 +PointerAlignment: Left +ReflowComments: true +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeParens: ControlStatements +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 2 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Auto +TabWidth: 8 +UseTab: Never +... + diff --git a/cvimath/.gitignore b/cvimath/.gitignore new file mode 100644 index 000000000..cf56d02fa --- /dev/null +++ b/cvimath/.gitignore @@ -0,0 +1,5 @@ +.vscode +build +install + + diff --git a/cvimath/CMakeLists.txt b/cvimath/CMakeLists.txt new file mode 100644 index 000000000..3422abee7 --- /dev/null +++ b/cvimath/CMakeLists.txt @@ -0,0 +1,85 @@ +project(cvimath) + +cmake_minimum_required(VERSION 3.2.2) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS ON) +#set(CMAKE_BUILD_WITH_INSTALL_RPATH TRUE) +#set(CMAKE_INSTALL_RPATH "\${ORIGIN}/../lib;\${ORIGIN}/") + +if ("${CMAKE_BUILD_TYPE}" STREQUAL "") + set(CMAKE_BUILD_TYPE "Release") +endif() + +if("${CMAKE_TOOLCHAIN_FILE}" STREQUAL "") + message("No toolchain file found. 
Using host compiler.") + if ("${CMAKE_INSTALL_PREFIX}" STREQUAL "/usr/local") + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/install") + endif() +else() + if ("${CMAKE_INSTALL_PREFIX}" STREQUAL "/usr/local") + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_SOURCE_DIR}/install_soc") + endif() +endif() + +set(CMAKE_C_INIT "-fsigned-char -fPIC -Werror=all -fdiagnostics-color=always") +set(CMAKE_CXX_INIT "-fsigned-char -fPIC -Werror=all -fdiagnostics-color=always -std=gnu++11") +if("${CMAKE_BUILD_TYPE}" STREQUAL "Release" OR "${CMAKE_BUILD_TYPE}" STREQUAL "RELEASE") + set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_INIT} -O3" ) + set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_INIT} -O3" ) +elseif("${CMAKE_BUILD_TYPE}" STREQUAL "Debug") + set( SAFETY_FLAGS "-Werror -Wall -Wextra -ggdb -fno-strict-aliasing") + set( SAFETY_FLAGS "${SAFETY_FLAGS} -fsanitize=address") + set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CMAKE_C_INIT} -g -O0 ${SAFETY_FLAGS}") + set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${CMAKE_CXX_INIT} -g -O0 ${SAFETY_FLAGS}" ) +else() + message(FATAL_ERROR "No build type!!!") +endif() + +message("==================================================") +message("[Summary]") +message("C compiler ${CMAKE_C_COMPILER}") +message("CXX compiler ${CMAKE_CXX_COMPILER}") +message("Build type ${CMAKE_BUILD_TYPE}") +message("Install dir ${CMAKE_INSTALL_PREFIX}") +message("==================================================") + +# Add externel libs +set( TPU_LD "-L${TPU_SDK_ROOT}/lib") +set( TPU_KERNEL_LIB "${TPU_LD} -lcvikernel") +# wait cvimath/cviruntime so are generated +set( TEST_LIBS cvimath cviruntime) + +# Add include path and set tpu libraries. 
+include_directories( + ${TPU_SDK_ROOT}/include + ${CVI_EXTRA}/include + "${CMAKE_CURRENT_SOURCE_DIR}/include") + +# https://stackoverflow.com/questions/30250494/ctest-not-detecting-tests +enable_testing() + +# ctest config +if (NOT CMAKE_CROSSCOMPILING) + if (ENABLE_TEST STREQUAL "ON") + add_subdirectory(tests) + endif() +endif() + +add_subdirectory(src) +add_subdirectory(sample) + +# export header +file(GLOB HEADERS + include/cvimath.h + include/cvimath_internal.h + include/test_cvikernel_util.h + ) + +# export sample +#file(GLOB SAMPLES sample/*) + +#install(FILES ${SAMPLES} DESTINATION samples/cvimath) +install(FILES ${CMAKE_SOURCE_DIR}/toolchain/toolchain-aarch64-linux.cmake DESTINATION samples/cvimath) +install(FILES ${HEADERS} DESTINATION include/cvimath) diff --git a/cvimath/README.md b/cvimath/README.md new file mode 100644 index 000000000..7f9f3bd5c --- /dev/null +++ b/cvimath/README.md @@ -0,0 +1,21 @@ +# CviMath + +## How to build + +### Requirements + +1. MLIR SDK + +SOC mode + +``` +$ mkdir build +$ cd build +$ cmake -G Ninja .. 
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ + -DTOOLCHAIN_ROOT_DIR=${PWD}/../../gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu \ + -DCMAKE_TOOLCHAIN_FILE=${PWD}/../toolchain/toolchain-aarch64-linux.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX= \ + -DTPU_SDK_ROOT= +$ ninja -j8 && ninja install +``` diff --git a/cvimath/clang-format.sh b/cvimath/clang-format.sh new file mode 100755 index 000000000..9190a8135 --- /dev/null +++ b/cvimath/clang-format.sh @@ -0,0 +1,8 @@ +#!/bin/bash +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" +CLANG_ROOT=$(readlink -f $SCRIPT_DIR) + +find $CLANG_ROOT/include -regex '.*\.\(cpp\|h\|hpp\|cc\|c\|cxx\|inc\)' -exec clang-format -i {} \; +find $CLANG_ROOT/src -regex '.*\.\(cpp\|h\|hpp\|cc\|c\|cxx\|inc\)' -exec clang-format -i {} \; +find $CLANG_ROOT/tests -regex '.*\.\(cpp\|h\|hpp\|cc\|c\|cxx\|inc\)' -exec clang-format -i {} \; +find $CLANG_ROOT/sample -regex '.*\.\(cpp\|h\|hpp\|cc\|c\|cxx\|inc\)' -exec clang-format -i {} \; diff --git a/cvimath/include/cvimath.h b/cvimath/include/cvimath.h new file mode 100644 index 000000000..1d2d1bf8e --- /dev/null +++ b/cvimath/include/cvimath.h @@ -0,0 +1,84 @@ +#ifndef CVIMATH_H +#define CVIMATH_H + +#include + +// public function +#ifdef __cplusplus +extern "C" { +#endif + +/** + * @brief This function calculated the unit length of a precahed i8 feature array + * + * @param precached Prefetched feature array in 1-D. Format: feature1, feature2, ... + * @param unit_precached_arr Output unit length. + * @param data_length The length of the feature. + * @param data_num The number of features. + */ +void cvm_gen_precached_i8_unit_length(int8_t *precached, float *unit_precached_arr, + const uint32_t data_length, const uint32_t data_num); + +/** + * @brief This function calculated the unit length of a precahed u8 feature array + * + * @param precached Prefetched feature array in 1-D. Format: feature1, feature2, ... + * @param unit_precached_arr Output unit length. 
+ * @param data_length The length of the feature. + * @param data_num The number of features. + */ +void cvm_gen_precached_u8_unit_length(uint8_t *precached, float *unit_precached_arr, + const uint32_t data_length, const uint32_t data_num); + +/** + * @brief Do inner product matching on i8 feature with given precached feature array. + * + * @param feature The input i8 feature to be compared. + * @param precached The precached feature array in 1-D. + * @param unit_precached_arr The unit length array of the precached. + * @param k_index The output matching index result in order. + * @param k_value The output matching value result in order. + * @param buffer The buffer used by this function, same length as precached. + * @param data_length The length of the single feature. + * @param data_num The number of features of the feature array. + * @param k Top k results, affects the length of k_index and k_value. + */ +void cvm_cpu_i8data_ip_match(int8_t *feature, int8_t *precached, float *unit_precached_arr, + uint32_t *k_index, float *k_value, float *buffer, + const uint32_t data_length, const uint32_t data_num, const uint32_t k); + +/** + * @brief Do inner product matching on u8 feature with given precached feature array. + * + * @param feature The input u8 feature to be compared. + * @param precached The precached feature array in 1-D. + * @param unit_precached_arr The unit length array of the precached. + * @param k_index The output matching index result in order. + * @param k_value The output matching value result in order. + * @param buffer The buffer used by this function, same length as precached. + * @param data_length The length of the single feature. + * @param data_num The number of features of the feature array. + * @param k Top k results, affects the length of k_index and k_value. 
+ */ +void cvm_cpu_u8data_ip_match(uint8_t *feature, uint8_t *precached, float *unit_precached_arr, + uint32_t *k_index, float *k_value, float *buffer, + const uint32_t data_length, const uint32_t data_num, const uint32_t k); + +// Legacy support for hj. +inline void __attribute__((always_inline)) +cvm_gen_db_i8_unit_length(int8_t *precached, float *unit_precached_arr, const uint32_t data_length, + const uint32_t data_num) { + cvm_gen_precached_i8_unit_length(precached, unit_precached_arr, data_length, data_num); +} + +inline void __attribute__((always_inline)) +cvm_gen_db_unit_length(uint8_t *precached, float *unit_precached_arr, const uint32_t data_length, + const uint32_t data_num) { + cvm_gen_precached_u8_unit_length(precached, unit_precached_arr, data_length, data_num); +} + +#ifdef __cplusplus +} +#endif + +#endif // CVIMATH_H diff --git a/cvimath/include/cvimath_internal.h b/cvimath/include/cvimath_internal.h new file mode 100644 index 000000000..555b6067b --- /dev/null +++ b/cvimath/include/cvimath_internal.h @@ -0,0 +1,1066 @@ +#ifndef CVIMATH_INTERNAL_H +#define CVIMATH_INTERNAL_H + +#include //bool +#include //size_t +#include "cvimath.h" + +// copy from lagency +// TODO: move to properly header files +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; +typedef uint64_t u64; + +typedef int8_t s8; +typedef int16_t s16; +typedef int32_t s32; +typedef int64_t s64; + +static inline uint64_t align_up(uint64_t x, uint64_t n) { return (x + n - 1) / n * n; } + +/** + * please refer @example for more details + */ +#include + +#define CVK_MULTIPLIER_BIAS_PACKED_DATA_SIZE 9 +#define CVK_MULTIPLIER_ONLY_PACKED_DATA_SIZE 5 + +// public function +#ifdef __cplusplus +extern "C" { +#endif +/** + * @brief get lookup tabel shape + * + * @param cvk_ctx kernel structure + * @param [out] shape the table shape + */ +void cvm_table_shape(cvk_context_t *cvk_ctx, cvk_tl_shape_t *shape); + +/** + * @brief generate sqrt look up table for bf16 exponent part + * + * 
@param [out] table_data bf16 exponent part lookup table in host + * @param table_shape table shape + */ +void cvm_gen_sqrt(uint16_t *table_data, cvk_tl_shape_t *table_shape); + +/** + * @brief syntactic sugar for cvm_gen_sqrt/cvm_gen_sqrt_mantissa + * + * @param [out] sqrt_table_data bf16 exponent part lookup table in host + * @param [out] sqrt_table_data_mantissa bf16 fraction part lookup table in host + * @param table_shape table shape + */ +void cvm_sqrt_tbl(uint16_t *sqrt_table_data, uint16_t *sqrt_table_data_mantissa, + cvk_tl_shape_t *table_shape); + +/** + * @brief generate sqrt look up table for bf16 fraction part + * + * @param [out] table_mantissa bf16 fraction part lookup table in host + * @param table_shape table shape + */ +void cvm_gen_sqrt_mantissa(uint16_t *table_mantissa, cvk_tl_shape_t *table_shape); + +/** + * @brief implement sqrt in tpu memory + * + * @param cvk_ctx kernel structure + * @param tl_ifmap input tensor in tpu memory + * @param tl_buf working buffer + * @param tbl_answer lookup table tensor for bf16 exponent part in tpu memory + * @param tbl_answer_mantissa lookup table tensor for fraction part in tpu memory + * @param [out] tl_ofmap_bf16 result in in memory + * + * @example + * // 1. alloc in tpu memory + * // 2. prepare table + * cvm_sqrt_tbl(table_data, table_data_mantissa, &table_shape); + * // 3. put host data to tpu memory + * // 4. prepare command buffer + * cvm_emit_sqrt(cvk_ctx, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_mantissa, + * tl_ofmap_bf16); + * // 5. submit it + * test_submit_comp(rt_ctx, cvk_ctx); + * + * // 6. 
get result from tpu memory + * uint16_t* result = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16, + * tl_ofmap_bf16->fmt); + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_emit_sqrt(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tbl_answer, cvk_tl_t *tbl_answer_mantissa, cvk_tl_t *tl_ofmap_bf16); + +/** + * @brief generate reciprocal look up table for bf16 exponent part + * + * @param [out] table_data bf16 exponent part lookup table in host + * @param table_shape table shape + */ +void cvm_gen_reciprocal(uint16_t *table_data, cvk_tl_shape_t *table_shape); + +/** + * @brief generate reciprocal look up table for bf16 fraction part + * + * @param [out] table_mantissa bf16 fraction part lookup table in host + * @param table_shape table shape + */ +void cvm_gen_reciprocal_mantissa(uint16_t *table_mantissa, cvk_tl_shape_t *table_shape); + +/** + * @brief syntactic sugar for cvm_gen_reciprocal/cvm_gen_reciprocal_mantissa + * + * @param [out] sqrt_table_data bf16 exponent part lookup table in host + * @param [out] sqrt_table_data_mantissa bf16 fraction part lookup table in host + * @param table_shape table shape + */ +void cvm_reciprocal_tbl(uint16_t *table_data, uint16_t *table_mantissa, + cvk_tl_shape_t *table_shape); + +/** + * @brief implement reciprocal in tpu memory + * + * @param cvk_ctx kernel structure + * @param tl_ifmap input tensor in tpu memory + * @param tl_buf working buffer + * @param tbl_answer lookup table tensor for bf16 exponent part in tpu memory + * @param tbl_answer_mantissa lookup table tensor for fraction part in tpu memory + * @param [out] tl_ofmap_bf16 result in in memory + * + * @example + * int align = 1; // align eu(excution unit) + * // 1. alloc in tpu memory + * // 2. prepare table + * cvm_reciprocal_tbl(table_data, table_data_mantissa, &table_shape); + * // 3. put host data to tpu memory + * // 4. 
prepare command buffer + * cvm_emit_reciprocal(cvk_ctx, tl_ifmap, tl_buf, cvk_tl_table_answer, + * cvk_tl_table_answer_mantissa, tl_ofmap_bf16); + * + * // 5. submit it + * test_submit_comp(rt_ctx, cvk_ctx); + * + * // 6. get result from tpu memory + * uint16_t* result = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16, + * tl_ofmap_bf16->fmt); + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_emit_reciprocal(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tbl_answer, cvk_tl_t *tbl_answer_mantissa, + cvk_tl_t *tl_ofmap_bf16); + +/** + * @brief generate sigmoid lookup table in host, + * we leverage Linear interpolation fairly close to the original + * you can refer [wiki](https://en.wikipedia.org/wiki/Interpolation) for more details + * + * @param [out] sigmoid_table_data lookup table in host + * @param [out] sigmoid_table_data_slope slope table in host + * @param table_shape table shape + * @param range_start quantize range from, + * e.g: the original input range is -127 ~ 128, we quantize to -8 ~ 8 + * than -8 is our \range_start and 8 is \range_end + * @param range_end quantize range end + */ +void cvm_sigmoid_tbl(uint16_t *sigmoid_table_data, uint16_t *sigmoid_table_data_slope, + cvk_tl_shape_t *table_shape, int range_start, int range_end); + +/** + * @brief get scale factor from \range_start and \range_end + * + * @param range_start quantize range from + * @param range_end quantize range end + * + * @return scale factor + */ +float cvm_sigmoid_scale(int range_start, int range_end); + +/** + * @brief get sigmoid value by linear interpolation + * + * @param cvk_ctx kernel structure + * @param tl_ifmap input tensor in tpu memory + * @param tl_buf working buffer + * @param tl_table_answer sigmoid table in tpu memory generated by \cvm_sigmoid_tbl + * @param tl_table_answer_slope sigmoid slope table in tpu memory generated by \cvm_sigmoid_tbl + * @param [out] tl_ofmap_bf16 result in in 
memory + * @param scale scale factor generated by \cvm_sigmoid_scale + * + * @example + * // 1. alloc in tpu memory + * // 2. prepare table + * cvm_sigmoid_tbl(table_data, table_data_slope, &table_shape, range_start, range_end); + * float scale = cvm_sigmoid_scale(range_start, range_end); + * // 3. put host data to tpu memory + * // 4. prepare command buffer + * cvm_emit_sigmoid(cvk_ctx, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_slope, + * tl_ofmap_bf16, scale); + * // 5. submit it + * test_submit_comp(rt_ctx, cvk_ctx); + * + * // 6. get result from tpu memory + * uint16_t* result = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16, + * tl_ofmap_bf16->fmt); + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_emit_sigmoid(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_table_answer, cvk_tl_t *tl_table_answer_slope, + cvk_tl_t *tl_ofmap_bf16, float scale); + +/** + * @brief General Matrix Multiplication + * that equal \lhs_gaddr * \rhs_gaddr = \dest_gaddr + * + * @param cvk_ctx kernel structure + * @param lhs_gaddr left hand side device memory address + * @param rhs_gaddr right hand side device memory address + * @param dest_gaddr destination device memory address + * @param in_row \lhs_gaddr matrix row + * @param in_col \lhs_gaddr matrix col + * @param out_col \dest_gaddr matrix col + * @param fmt the possible value is \CVK_FMT_BF16 or \CVK_FMT_I8 or \CVK_FMT_U8 + * @example + * + * // 1. alloc host memory and put it to device memory + * // M=in_row K=in_col N=out_col + * cvk_mg_t *mg_A = _test_put_matrix_g(&ctx, M, K, CVK_FMT_BF16, (uint8_t *)bf16_A); + * cvk_mg_t *mg_B = _test_put_matrix_g(&ctx, K, N, CVK_FMT_BF16, (uint8_t *)bf16_B); + * cvk_mg_t *mg_R = _test_put_matrix_g(&ctx, M * 2, N, CVK_FMT_BF16, (uint8_t *)bf16_R); + * + * // 2. 
get device address for gemm + * gaddr_t gaddr_a = mg_A->start_address; + * gaddr_t gaddr_b = mg_B->start_address; + * gaddr_t gaddr_r = mg_R->start_address; + * + * // 3. prepare gemm descriptor + * cvm_gemm(cvk_ctx, gaddr_a, gaddr_b, gaddr_r, M, K, N); + * + * // 4. submit descriptor + * test_submit_comp(&ctx, cvk_ctx); + * + * // 5. get result from device to host + * uint16_t *bf16_ref = (uint16_t *)test_get_mg_mem_comp(&ctx, mg_R); + * + * @ return slice_num array of {M, N, K} + */ +size_t *cvm_gemm(cvk_context_t *cvk_ctx, uint64_t lhs_gaddr, uint64_t rhs_gaddr, + uint64_t dest_gaddr, int in_row, int in_col, int out_col, cvk_fmt_t fmt); + +/** + * @brief combine \cvm_gemm int8 result to int32 + * the raw output is seperate 32bit result info 4 part with bstride + * and we need to 'combine' it to human readable + * for instance, the following is the raw result + * lsb 31 msb + * 0x1 0x2 0x3 0x4 0x5 0x6 0x7 0x8 + * 0x9 0xa 0xb 0xc 0xd 0xe 0xf 0x0 + * 0x11 0x12 0x13 0x14 0x15 0x16 0x17 0x18 + * 0x19 0x20 0x21 0x22 0x23 0x24 0x25 0x26 + * + * the value by strategy could be column major: + * 1. 0x19110901 + * 2. 0x20120a02 + * 3. 0x21130b03 + * and so on + * + * @param cvm_gemm_strategy return strategy value from \cvm_gemm + * @param cvm_output raw result from \cvm_gemm + * @param [out] i32_R int32 result + * @param M row of output matrix + * @param N column of output matrix + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_combin_gemm_i8(size_t *cvm_gemm_strategy, uint8_t *cvm_output, uint32_t *i32_R, int M, + int N); +/** + * @brief fp32 to bf16 format int device memory + * + * @param cvk_ctx kernel structure + * @param gaddr_fp32 fp32 data with device memory address + * @param fp32_shape fp32 tensor shape + * @param [out] gaddr_bf16 bf16 data with device memory address + * @param bf16_shape bf16 tensor shape + * @param fmt tensor format such as \CVK_FMT_BF16 + * + * @example + * + * cvk_tl_shape_t s = {1, 2, 3, 4} + * // 1. 
put fp32 to device memory + * test_put_tg_mem_comp(rt_ctx, tg_with_fp32, data) + * // 2. init bf16 tg + * // 3. prepare command buffer + * cvm_s2s_fp32_bf16(cvk_ctx, tg_with_fp32->start_address, tg_with_fp32->shape, + * tg_with_bf16->start_address, tg_with_bf16->shape, CVK_FMT_BF16); + * // 4. submit it + * test_submit_comp(rt_ctx, cvk_ctx); + * // 5. get result from device memory + * uint16_t *dst_data = (uint16_t *)test_get_tg_mem_comp(rt_ctx, tg_with_bf16); + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_s2s_fp32_bf16(cvk_context_t *cvk_ctx, uint64_t gaddr_fp32, cvk_tg_shape_t fp32_shape, + uint64_t gaddr_bf16, cvk_tg_shape_t bf16_shape, cvk_fmt_t fmt); + +/** + * @brief generate lookup table for check input is 0 or not + * + * @param [out] table_0 lookup table for 0 or not + * @param table_shape table shape + */ +void cvm_gen_0_tbl(uint16_t *table_0, cvk_tl_shape_t *table_shape); + +// mask function +/** + * @brief get mask value that seperate 0 or not + * e.g: input = [0, 1, -1, 2] output [1, 0, 0, 0] + * please see \cvm_emit_mask for more details + * + * @param cvk_ctx kernel structure + * @param tl_ifmap input in tpu memory + * @param tl_buf working buffer + * @param tbl_answer lookup table for 0 or not in tpu memory, generate by \cvm_gen_0_tbl + * @param [out] tl_ofmap_bf16 mask result in tpu memory + * @param fmt tensor format such as \CVK_FMT_BF16 + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_emit_0_idx(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tbl_answer, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +/** + * @brief get mask value that check < 0 + * e.g: input = [0, 10, 6, -1, 0] output [0, 0, 0, 1, 0] + * please see \cvm_emit_mask for more details + * + * @param cvk_ctx kernel structure + * @param tl_ifmap input in tpu memory + * @param tl_buf working buffer + * @param tl_pos_neg_buf lookup table generate from \cvm_pos_neg_tbl + * @param 
[out] tl_ofmap_bf16 mask result in tpu memory + * @param fmt tensor format such as \CVK_FMT_BF16 + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_emit_neg_idx(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +/** + * @brief get mask value that check >= 0 + * e.g: input = [0, 10, 6, -1, 0] output [0, 1, 1, 0, 0] + * please see \cvm_emit_mask for more details + * + * @param cvk_ctx kernel structure + * @param tl_ifmap input in tpu memory + * @param tl_buf working buffer + * @param tl_pos_neg_buf lookup table generate from \cvm_pos_neg_tbl + * @param [out] tl_ofmap_bf16 mask result in tpu memory + * @param fmt tensor format such as \CVK_FMT_BF16 + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_emit_pos_idx(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +/** + * @brief invert 0/1 input + * e.g: input = [0, 1, 1, 1, 0] output [1, 0, 0, 0, 1] + * + * @param cvk_ctx kernel structure + * @param tl_ifmap input in tpu memory + * @param tl_buf working buffer + * @param [out] tl_ofmap_bf16 mask result in tpu memory + * @param fmt + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_emit_0_1_revert_input(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +// mask enum define +enum CVM_MASK_TYPE { + CVM_MASK_TYPE_GT_0 = 0, // remain > 0 + CVM_MASK_TYPE_GE_0, // remain >= 0 + CVM_MASK_TYPE_EQ_0, // remain = 0 + CVM_MASK_TYPE_LT_0, // remain < 0 + CVM_MASK_TYPE_LE_0, // remain <= 0 + CVM_MASK_MAX +}; + +/** + * @brief get mask for \CVM_MASK_TYPE case + * + * @param cvk_ctx kernel structure + * @param tl_ifmap input in tpu memory + * @param tl_buf working buffer + * @param tl_buf2 working buffer + * @param tl_buf3 working buffer + * @param 
tl_pos_neg_table lookup table generate from \cvm_pos_neg_tbl + * @param tl_0_idx_table lookup table for 0 or not in tpu memory generated by \cvm_gen_0_tbl + * @param [out] tl_ofmap_bf16 mask result in tpu memory + * @param fmt tensor format such as \CVK_FMT_BF16 + * @param mask \CVM_MASK_TYPE + * + * @example + * // 1. alloc in tpu memory + * // 2. prepare table + * cvm_gen_0_tbl(idx_0_table_data, &table_shape); + * cvm_pos_neg_tbl(table_data_atan_pos_neg, &table_shape); + * // 3. put host data to tpu memory + * // 4. prepare command buffer + * cvm_emit_mask(cvk_ctx, + * tl_ifmap, // input + * tl_buf, tl_buf2, tl_buf4, // tmp buffer + * tl_pos_neg_buf, tl_0_idx_table, // lookup table + * tl_ofmap_bf16, // output + * fmt, mode); + * + * // 5. submit it + * test_submit_comp(rt_ctx, cvk_ctx); + * + * // 6. get result from tpu memory + * uint16_t* result = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16, + * tl_ofmap_bf16->fmt); + * + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_emit_mask(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2, + cvk_tl_t *tl_buf3, cvk_tl_t *tl_pos_neg_table, cvk_tl_t *tl_0_idx_table, + cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt, enum CVM_MASK_TYPE mask); + +/** + * @brief generate lookup table for atan by degree + * + * @param [out] table_data_y0 atan by degree lookup table in host + * @param table_shape table shape + */ +void cvm_atan_fast_degree_y0(uint16_t *table_data_y0, cvk_tl_shape_t *table_shape); + +/** + * @brief generate lookup table for check value of absolute in [0,1] or > 1 + * atan2 used, [0-1] indicate 1, > 1 indicate with -1 + * + * @param [out] table_invert lookup table in host + * @param table_shape table shape + */ +void cvm_atan_s_01(uint16_t *table_invert, cvk_tl_shape_t *table_shape); + +/** + * @brief generate table for check input value is positive(>=0) or negtive(<0) + * by lookup table, 'pos_neg' means data is positive(>=0) is 1 or 
negtive(<0) is -1 + * + * @param [out] table_pos_neg lookup table in host + * @param table_shape table shape + */ +void cvm_pos_neg_tbl(uint16_t *table_pos_neg, cvk_tl_shape_t *table_shape); + +// deprecated code from \cvm_pos_neg_tbl +void cvm_atan_pos_neg(uint16_t *table_pos_neg, cvk_tl_shape_t *table_shape); + +/** + * @brief generate atan answer by lookup table, + * plz refer [git](https://github.com/xiezhq-hermann/atan_lookup) for more details + * + * @param [out] table_data_y0 atan answer lookup table in host + * @param table_shape table shape + */ +void cvm_atan_y0(uint16_t *table_data_y0, cvk_tl_shape_t *table_shape); + +/** + * @brief generate atan slope data, for more accuracy + * + * @param [out] table_slope atan slope lookup table in host + * @param table_shape table shape + */ +void cvm_atan_slope(uint16_t *table_slope, cvk_tl_shape_t *table_shape); + +/** + * @brief syntactic sugar for cvm_atan_y0/cvm_atan_slope/cvm_atan_s_01/cvm_pos_neg_tbl + * + * @param [out] table_data_atan_y0 atan answer lookup table in host + * @param [out] table_data_atan_slope atan slope lookup table in host + * @param [out] table_data_atan_invert lookup table in host + * @param [out] table_data_atan_pos_neg lookup table in host + * @param table_shape table shape + */ +void cvm_atan_tbl(uint16_t *table_data_atan_y0, uint16_t *table_data_atan_slope, + uint16_t *table_data_atan_invert, uint16_t *table_data_atan_pos_neg, + cvk_tl_shape_t *table_shape); + +/** + * @brief implement atan in tpu memory + * + * @param cvk_ctx kernel structure + * @param tl_ifmap input tensor in tpu memory + * @param tl_buf working buffer + * @param tl_buf2 working buffer + * @param tl_buf3 working buffer + * @param tl_y0_buf atan lookup table in tpu memory + * @param tl_slope_buf atan slope lookup table in tpu memory + * @param tl_invert_buf lookup table in tpu memory + * @param tl_pos_neg_buf lookup table in memory + * @param tl_table_answer reciprocal for bf16 exponent part in tpu memory + * @param 
tl_table_answer_mantissa reciprocal for bf16 fraction part in tpu memory + * @param [out] tl_ofmap_bf16 result in tpu memory + * @param fmt tensor format such as \CVK_FMT_BF16 + * + * @example + * // 1. alloc in tpu memory + * // 2.1. get reciprocal table in host + * cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape); + * // 2.2. get atan table in host + * cvm_atan_tbl(table_data_atan_y0, table_data_atan_slope, table_data_atan_invert, + * table_data_atan_pos_neg, &table_shape); + * // 3. put host data to tpu memory + * // 4. prepare command buffer + * cvm_atan_emit(cvk_ctx, tl_ifmap, tl_buf, tl_buf2, tl_buf4, tl_y0_buf, + * tl_slope_buf, tl_invert_buf, tl_pos_neg_buf, tl_reciprocal_table_answer, + * tl_reciprocal_table_answer_mantissa, tl_ofmap_bf16, fmt); + * + * // 5. submit it + * test_submit_comp(rt_ctx, cvk_ctx); + * // 6. get result from tpu memory + * uint16_t *ofmap_data = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16, + * tl_ofmap_bf16->fmt); + * @return status, 0 means success, other means generates command fail + */ +int cvm_atan_emit(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2, + cvk_tl_t *tl_buf3, cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_slope_buf, + cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_table_answer, + cvk_tl_t *tl_table_answer_mantissa, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); +// atan2 function +/** + * @brief syntactic sugar for generate atan in degree lookup table in + * host/cvm_atan_s_01/cvm_pos_neg_tbl + * + * @param [out] table_data_atan_y0 atan answer lookup table in host + * @param [out] table_data_atan_invert lookup table in host + * @param [out] table_data_atan_pos_neg lookup table in host + * @param table_shape table shape + */ +void cvm_atan_fast_degree_tbl(uint16_t *table_data_atan_y0, uint16_t *table_data_atan_invert, + uint16_t *table_data_atan_pos_neg, cvk_tl_shape_t *table_shape); + +/** + * @brief implement atan2 by degree 
in tpu memory, implemented by atan. you can refer + * [wiki](https://en.wikipedia.org/wiki/Atan2) for more details + * + * @param cvk_ctx kernel structure + * @param y input tensor in tpu memory + * @param x input tensor in tpu memory + * @param tl_buf working buffer + * @param tl_buf2 working buffer + * @param tl_buf3 working buffer + * @param tl_y0_buf atan2 lookup table in tpu memory + * @param tl_invert_buf lookup table in tpu memory + * @param tl_pos_neg_buf lookup table in memory + * @param tl_table_answer reciprocal for bf16 exponent part in tpu memory + * @param tl_table_answer_mantissa reciprocal for bf16 fraction part in tpu memory + * @param [out] tl_ofmap_bf16 result in tpu memory + * @param fmt tensor format such as \CVK_FMT_BF16 + * + * @example + * // 1. alloc in tpu memory + * // 2.1. get reciprocal table in host + * cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape); + * // 2.2. get atan table in host + * cvm_atan_fast_degree_tbl(table_data_atan_y0, table_data_atan_slope, table_data_atan_invert, + * table_data_atan_pos_neg, &table_shape); + * // 3. put host data to tpu memory + * // 4. prepare command buffer + * cvm_atan2_fast_degree_emit( + * cvk_ctx, tl_ifmap2, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, + * tl_invert_buf, tl_pos_neg_buf, tl_reciprocal_table_answer, + * tl_reciprocal_table_answer_mantissa, tl_ofmap_bf16, fmt); + * + * // 5. submit it + * test_submit_comp(rt_ctx, cvk_ctx); + * // 6. 
get result from tpu memory + * uint16_t *ofmap_data = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16, + * tl_ofmap_bf16->fmt); + */ +void cvm_atan2_fast_degree_emit(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *x, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_y0_buf, + cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf, + cvk_tl_t *tl_table_answer, cvk_tl_t *tl_table_answer_mantissa, + cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); +/** + * @brief implement atan2 in tpu memory, implemented by atan. you can refer + * [wiki](https://en.wikipedia.org/wiki/Atan2) for more details + * + * @param cvk_ctx kernel structure + * @param y input tensor in tpu memory + * @param x input tensor in tpu memory + * @param tl_buf working buffer + * @param tl_buf2 working buffer + * @param tl_buf3 working buffer + * @param tl_y0_buf atan2 lookup table in tpu memory + * @param tl_invert_buf lookup table in tpu memory + * @param tl_pos_neg_buf lookup table in memory + * @param tl_table_answer reciprocal for bf16 exponent part in tpu memory + * @param tl_table_answer_mantissa reciprocal for bf16 fraction part in tpu memory + * @param [out] tl_ofmap_bf16 result in tpu memory + * @param fmt tensor format such as \CVK_FMT_BF16 + * + * @example + * // 1. alloc in tpu memory + * // 2.1. get reciprocal table in host + * cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape); + * // 2.2. get atan table in host + * cvm_atan_fast_degree_tbl(table_data_atan_y0, table_data_atan_slope, table_data_atan_invert, + * table_data_atan_pos_neg, &table_shape); + * // 3. put host data to tpu memory + * // 4. prepare command buffer + * cvm_atan2_fast_degree_emit( + * cvk_ctx, tl_ifmap2, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, + * tl_invert_buf, tl_pos_neg_buf, tl_reciprocal_table_answer, + * tl_reciprocal_table_answer_mantissa, tl_ofmap_bf16, fmt); + * + * // 5. submit it + * test_submit_comp(rt_ctx, cvk_ctx); + * // 6. 
get result from tpu memory + * uint16_t *ofmap_data = (uint16_t *)get_bf16_tensor_l2g(rt_ctx, cvk_ctx, tl_ofmap_bf16, + * tl_ofmap_bf16->fmt); + */ +void cvm_atan2_merge_emit(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *x, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_y0_buf, + cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf, + cvk_tl_t *tl_table_answer, cvk_tl_t *tl_table_answer_mantissa, + cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +/** + * @brief get lookup table size for host alloc mamory used + * + * @param cvk_ctx kernel structure + * @param table_shape table shape + * @param fmt tensor format such as \CVK_FMT_BF16 + * + * @return table size in bytes + */ +uint64_t cvm_lut_tbl_bytesize(cvk_context_t *cvk_ctx, cvk_tl_shape_t *table_shape, cvk_fmt_t fmt); + +/** + * @brief calculate new proper reshape channel for depthwise + * current only support batch = 1 + * + * @param cvk_ctx kernel structure + * @param ic origin input shape of c + * @param ih origin input shape of h + * @param iw origin input shape of w + * @param kh origin kerenl shape of h + * @param kw origin kerenl shape of w + * @param pad_right padding right with input + * @param pad_left padding left with input + * @param stride_h stride h with input + * @param stride_w stride w with input + * @param [out] tl_load_shape shape structure for input in tpu memory + * @param [out] new_tl_ifmap_stride deprecated that stride for input in tpu memory + * @param [out] new_tg_ifmap_shape shape structure for input in device memory + * @param [out] new_tg_ifmap_stride stride structure for input in device memory + * @param [out] new_tl_weight_shape reshape weight in tpu memory + * @param [out] new_tl_bias_shape reshape bias in tpu memory + * @param [out] new_tl_ofmap_shape reshape output in tpu memory + * @param fmt the possible value is \CVK_FMT_BF16 or \CVK_FMT_I8 or \CVK_FMT_U8 + * @param eu_align currently MUST set 1 is force align with hardware + * + * @example + * int align = 1; 
// force align + * cvk_tiu_depthwise_pt_convolution_param_t *p; + * // 1. get reshaped shape + * int r = cvm_reshape_channel_same( + * cvk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, stride_h, stride_w, + * &tl_load_shape, &tl_load_stride, &tg_shape, &tg_stride, &tl_weight_shape, + * &tl_bias_shape, &tl_output_shape, fmt, align); + * // reshape fail + * if (r == -1) { + * return -1; + * } + * + * // 2.1 load input + * // load input into tpu memory + * int load_align = 0; // not align for pack + * tmp_tl_load = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_load_shape, fmt, load_align); + * tmp_tg = test_alloc_tg_mem_comp(&rt_ctx, cvk_ctx, tg_shape, fmt); + * tmp_tg->stride = tg_stride; + + * // int8 + * cvk_tdma_g2l_tensor_copy_param_t p1; + * cvk_ctx->ops->tdma_g2l_tensor_copy(cvk_ctx, &p1); + * test_submit_comp(&rt_ctx, cvk_ctx); + * test_free_tg_mem_comp(&rt_ctx, tmp_tg); + + + * // fit for hw + * int align_in_tl = 1; + * tmp_tl_load->stride = bmk1880v2_tensor_lmem_default_stride( + * cvk_ctx, tmp_tl_load->shape, fmt, align_in_tl); + * p->ifmap = tmp_tl_load; + + * // 2.2 prepare load bias, put to tg and load back + * if (has_bias) { + * // bias must i8 + * int no_bias_align = 0; + * p->bias = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_bias_shape, fmt, no_bias_align); + * + * // duplicate bias and replace old + * uint32_t *new_bias = cvm_reshape_channel_weight( + * (uint8_t *)bias, tl_bias_shape.n, tl_bias_shape.c, tl_bias_shape.h, + * tl_bias_shape.w, org_oc, fmt); + * + * test_put_tensor_g2l_comp(&rt_ctx, cvk_ctx, p->bias, bias); + * } + * + * // 2.3 prepare load weight, put to tg and load back + * { + * int weight_align = 1; + * p->weight = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_weight_shape, fmt, weight_align); + * // duplicate kernel with c + * uint8_t *new_weight = cvm_reshape_channel_weight( + * (uint8_t *)weight, tl_weight_shape.n, tl_weight_shape.c, tl_weight_shape.h, + * tl_weight_shape.w, org_oc, fmt); + * + * test_put_tensor_g2l_comp(&rt_ctx, 
cvk_ctx, p->weight, (u16 *)weight); + * } + * + * // 2.4 prepard ofmap + * { + * // we allocate 'same' mode shape + * int output_align = 1; // hw need + * p->ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_output_shape, fmt, output_align); + * } + * + * // 3. prepare command buffer + * cvk_ctx->ops->tiu_pt_depthwise_convolution(cvk_ctx, p); + * + * // 4. submit it + * test_submit_comp(rt_ctx, cvk_ctx); + * + * // 5. get result from tpu memory + * output = test_get_tensor_l2g_comp(&rt_ctx, cvk_ctx, p->ofmap, fmt); + * + * @return status, -1 means fail, other means reshape slice success + */ +int cvm_reshape_channel_same(cvk_context_t *cvk_ctx, int ic, int ih, int iw, int kh, int kw, + int pad_right, int pad_left, int stride_h, int stride_w, + cvk_tl_shape_t *tl_load_shape, cvk_tl_stride_t *new_tl_ifmap_stride, + cvk_tg_shape_t *new_tg_ifmap_shape, + cvk_tg_stride_t *new_tg_ifmap_stride, + cvk_tl_shape_t *new_tl_weight_shape, cvk_tl_shape_t *new_tl_bias_shape, + cvk_tl_shape_t *new_tl_ofmap_shape, cvk_fmt_t fmt, int eu_align); + +/** + * @brief re-construct bias content by reshape channel + * + * @param bias original bias in host memory + * @param ni reshape bias shape of n + * @param ci reshape bias shape of c + * @param hi reshape bias shape of h + * @param wi reshape bias shape of w + * @param old_bias_c origin bias shape of c + * @param fmt the possible value is \CVK_FMT_BF16 or \CVK_FMT_I8 or \CVK_FMT_U8 + * + * @return bias host data + */ +uint32_t *cvm_reshape_channel_bias(uint8_t *bias, int ni, int ci, int hi, int wi, int old_bias_c, + cvk_fmt_t fmt); + +/** + * @brief re-construct weight content by reshape channel + * + * @param weight original bias in host memory + * @param ni reshape weight shape of n + * @param ci reshape weight shape of c + * @param hi reshape weight shape of h + * @param wi reshape weight shape of w + * @param old_weight_c origin weight shape of c + * @param fmt the possible value is \CVK_FMT_BF16 or \CVK_FMT_I8 or \CVK_FMT_U8 + * + 
* @return weight host data + */ +uint8_t *cvm_reshape_channel_weight(uint8_t *weight, int ni, int ci, int hi, int wi, + int old_weight_c, cvk_fmt_t fmt); + +typedef struct cvm_tiu_atan2_param { + cvk_tl_t *a; + cvk_tl_t *b; + cvk_tl_t *res; + cvk_tl_t *buf1; + cvk_tl_t *buf2; + cvk_tl_t *buf3; + cvk_tl_t *buf4; + cvk_tl_t *buf5; + cvk_tl_t *buf6; + cvk_tl_t *y0; + cvk_tl_t *slope; + cvk_tl_t *invert; + cvk_tl_t *pos_neg_table; + cvk_tl_t *reciprocal_table_answer; + cvk_tl_t *reciprocal_table_answer_mantissa; + cvk_tl_t *sqrt_table_answer; + cvk_tl_t *sqrt_table_answer_mantissa; + cvk_tl_t *idx_0_table; + cvk_fmt_t fmt; + bool output_degree; +} cvm_tiu_atan2_param_t; + +typedef struct cvk_tiu_mask_param { + cvk_tl_t *ifmap; + cvk_tl_t *ofmap; + cvk_tl_t *buf; + cvk_tl_t *buf2; + cvk_tl_t *buf3; + cvk_tl_t *pos_neg_table; + cvk_tl_t *idx_0_table; + cvk_fmt_t fmt; +} cvm_tiu_mask_param_t; + +typedef struct cvm_tiu_sigmoid_param { + float scale; + cvk_tl_t *ifmap; + cvk_tl_t *buf; + cvk_tl_t *table_answer; + cvk_tl_t *table_answer_slope; + cvk_tl_t *ofmap; +} cvm_tiu_sigmoid_param_t; + +typedef struct cvm_tiu_sqrt_param { + cvk_tl_t *a; + cvk_tl_t *res; + cvk_tl_t *buf; + cvk_tl_t *sqrt_table_answer; + cvk_tl_t *sqrt_table_answer_mantissa; +} cvm_tiu_sqrt_param_t; + +/** + * @brief get \quantized_multiplier and its \right_shift, + * please refer + * \https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/kernels/internal/common.h:MultiplyByQuantizedMultiplier + * for more details + * + * @param real_multiplier + * @param quantized_multiplier + * @param right_shift + */ +void cvm_get_chl_quan(float real_multiplier, uint32_t *quantized_multiplier, int *right_shift); + +/** + * @brief + * + * @param c + * @param quantized_multiplier + * @param right_shift + * @param cal_data + * @param bias_data + * @param has_bias + */ +void cvm_fill_chl_quan_data(const uint32_t c, const uint32_t quantized_multiplier, + const int right_shift, uint8_t *cal_data, int32_t 
*bias_data, + bool has_bias); + +/** + * @brief + * + * @param c + * @param quantized_multiplier + * @param right_shift + * @param bias_data + * @param has_bias + * + * @return + */ +uint8_t *cvm_get_chl_quan_data(const uint32_t c, const uint32_t quantized_multiplier, + const int right_shift, int32_t *bias_data, bool has_bias); + +/** + * @brief get byte size of input \fmt + * + * @param fmt \cvk_fmt_t structure + * + * @example + * int sz = cvm_bytesize_of_fmt(CVK_FMT_BF16); + * assert (sz == 2 && "bf16 takes 2 bytes") + * + * sz = cvm_bytesize_of_fmt(CVK_FMT_I8); + * assert (sz == 1 && "int8 takes 1 bytes") + * @return byte size of fmt + */ +int cvm_bytesize_of_fmt(cvk_fmt_t fmt); + +/** + * @brief reduce multiplication for h,w + * the possible shape will be <1, c, 1, 1> + * you could refer [here](https://en.wikipedia.org/wiki/Reduction_Operator) for + * more details + * + * @param cvk_ctx kernel structure + * @param [out] mp_tl_mulsum input tensor in tpu memory, the shape should be <1, c, h, w> + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_reduce_hw_mul(cvk_context_t *cvk_ctx, cvk_tl_t *mp_tl_mulsum); + +/** + * @brief bf16 to fp32, ONLY move bf16 to fp32 high 16 bits part, + * the memory layout as following: + * + * bf16: 0x4300 + * 0 16 (bit) + * ----- + * 0x4300 + * + * fp32: 0x43000000 + * ----- + * 0 16 32 + * 0x 0x43 + * + * @param cvk_ctx kernel structure + * @param tg_bf16 bf16 data in device memory + * @param [out] tg_fp32 fp32 data in decive memory, the w shape SHOULD be double with + * \tg_bf16->shape.w + */ +void cvm_bf16_fp32(cvk_context_t *cvk_ctx, cvk_tg_t *tg_bf16, cvk_tg_t *tg_fp32); + +/** + * @brief set value by mask(0/1) + * + * @param [in] tl_ifmap image input, MUST uint8 + * @param [in] tl_mask mask value, it MUST be 0 or 1, it will DIRTY it + * @param [in] tl_buf + * @param [in,out] tl_ofmap image output, MUST uint8, it will DIRTY it + * + * @return status, 0 means success, other means generates 
command fail + */ +int cvm_set_image_by_u8mask(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_mask, cvk_tl_t *tl_ofmap); + +/** + * @brief set value by mask(0/1) by DePthwise + * 0 means keep \tl_ofmap one + * 1 means overwrite with \tl_ifmap + * + * @param [in] tl_ifmap image input, MUST uint8 + * @param [in] tl_mask mask value, it MUST be 0 or 1, it will DIRTY it + * @param [in] tl_kernel for mask reverting(0/1->1/0) that the contain MUST BE -1 with int8 + * and shape SHOULD BE <1, tl_ifmap->shape.c, 1, 1> + * @param [in] tl_bias for mask reverting(0/1->1/0) that the contain MUST BE 1 with int8, + * seperate high/low part, and shape SHOULD BE <2, tl_ifmap->shape.c, 1, 1> + * @param [in,out] tl_ofmap image output, MUST uint8, it will DIRTY it + * + * @return status, 0 means success, other means generates command fail + */ + +int cvm_set_image_by_u8mask_dp(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_mask, + cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, cvk_tl_t *tl_ofmap); + +/** + * @brief set value by mask and threshold, set it + * if \tl_mask && (int8_t)\tl_update_tbl < threshold + * + * @param [in] tl_ifmap image input, MUST uint8 + * @param [in] tl_mask mask value, it MUST be 0 or 1, it will DIRTY it + * @param [in] tl_update_tbl the value range will under int8, it will DIRTY it + * @param [in,out] tl_ofmap image output, MUST uint8, it will DIRTY it + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_set_image_by_two_info_i8(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf2, + cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl, uint8_t threshold, + cvk_tl_t *tl_ofmap); + +/** + * @brief set value by mask and threshold by DePthwise, set it + * if \tl_mask && (int8_t)\tl_update_tbl < threshold + * + * @param [in] tl_ifmap image input, MUST uint8 + * @param [in] tl_kernel set all to 1 for \tl_update_tbl * 1 - threshold + * to test larger or smaller, + * that MUST BE 1 with int8 and shape SHOULD BE 
<1, tl_ifmap->shape.c, 1, 1> + * @param [in] tl_mask mask value, it MUST be 0 or 1, it will DIRTY it + * @param [in] tl_update_tbl the value range will under int8, it will DIRTY it + * @param [in] tl_threshold for boradcast \threshold to bias + * the type MUST BE int8 and seperate high/low part and it will DIRTY it + * @param [in,out] tl_ofmap image output, MUST uint8, it will DIRTY it + * + * @return status, 0 means success, other means generates command fail + */ + +int cvm_set_image_by_two_info_i8_dp(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_kernel, + cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl, + cvk_tl_t *tl_threshold, cvk_tl_t *tl_ofmap); + +/** + * @brief get abs(\tl_ifmap-tl_ifmap2) + * + * @param [in] tl_ifmap image input, MUST uint8 + * @param [in] tl_ifmap2 image input, MUST uint8, it will DIRTY it + * @param [out] tl_ofmap image output, MUST uint8, it will DIRTY it + * + * @return status, 0 means success, o, MUST uint8ther means generates command fail + */ +int cvm_gen_image_diff(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2, + cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2, cvk_tl_t *tl_ofmap); + +/** + * @brief update \tl_ofmap by \threshold_a, \threshold_b, + * plz refer \sample_set_val_by_mask.cpp for more details + * + * @param [out] tl_mask return 0/1 mask + * @param [in] tl_update_tbl u8 + * @param [in,out] tl_ofmap image output, int8 + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_update_tbl_by_threshold(cvk_context_t *ctx, cvk_tl_t *tl_mask, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_update_tbl, + uint8_t threshold_a, uint8_t threshold_b, cvk_tl_t *tl_ofmap); + +/** + * @brief set value by mask, update \tl_ofmap once (uint8_t)tl_update_tbl >= threshold + * + * @param [in] tl_ifmap image input, MUST uint8 + * @param [in] tl_update_tbl the value range will under uint8 + * @param [in,out] tl_ofmap image output, MUST uint8, it will DIRTY it + * + * @return status, 0 
means success, other means generates command fail + */ +int cvm_set_image_by_two_info_u8(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_update_tbl, uint8_t threshold, + cvk_tl_t *tl_ofmap); + +/** + * @brief set value by mask + * if (int8_t)\tl_update_tbl > threshold + * + * @param [in] tl_ifmap image input + * @param [in] tl_update_tbl int8, MUST uint8, it will DIRTY + * @param [in,out] tl_ofmap image output, MUST uint8, it will DIRTY it + * + * @return status, 0 means success, other means generates command fail + */ +int cvm_blend_image_by_tbl(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_update_tbl, uint8_t threshold, + uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap); +/** + * @brief get upsample 2d with nearest mode + * + * @param [in] tl_ifmap + * @param [in] tl_weight upsample used that fill with 1 + * @param [out] tl_ofmap + * + * @return status, 0 means success, other means generates command fail + */ + +int cvm_upsample2d(cvk_context_t *ctx, cvk_tl_t *tl_input, cvk_tl_t *tl_weight, + cvk_tl_t *tl_output); +#ifdef __cplusplus +} +#endif + +#endif // CVIMATH_INTERNAL_H diff --git a/cvimath/include/test_cvikernel_util.h b/cvimath/include/test_cvikernel_util.h new file mode 100644 index 000000000..75fc164f5 --- /dev/null +++ b/cvimath/include/test_cvikernel_util.h @@ -0,0 +1,393 @@ +#ifndef CVIMATH_TEST_UTIL_H +#define CVIMATH_TEST_UTIL_H + +#include +#include "cvikernel/cvikernel.h" + +#include "bmruntime.h" +#include "bmruntime_bmkernel.h" + +#include +#include // pow +#include // uint8_t / uint16_t +#include /* printf, scanf, NULL */ +#include /* malloc, free, rand */ +#include // strncpy + +// copy from lagency +// TODO: move to properly header files +#define __ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask)) +#define ALIGN(x, a) __ALIGN_MASK(x, (__typeof__(x))(a)-1) +typedef uint32_t laddr_t; +typedef uint64_t gaddr_t; +typedef uint32_t ctrl_t; +#define CTRL_NULL 0 
+#define CTRL_AL (1 << 0) // alloc aligned with EU_NUM +#define CTRL_TP (1 << 5) // transpose +#define CTRL_NEURON (1 << 11) // mark neuron address in GDMA + +#define LADDR_INVALID (0xFFFFFFFF) +#define GADDR_INVALID (0x000000FFFFFFFFFFULL) +static inline int ceiling_func(int numerator, int denominator) { + return (numerator + denominator - 1) / denominator; +} +static inline int ceiling_func_shift(int numerator, int shift) { + return (numerator + (1 << shift) - 1) >> shift; +} +static inline int get_num_shift(uint64_t num) { + int n = 0; + while (!(num & 1)) { + n++; + num >>= 1; + } + return n; +} + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * bm runtime binds with bm kernel. + * cvi kernel still needs bm runtime. + * + * Need to create the separate function to combine bm runtime and cvi kernel. + * Function with postfix _comp (compatible) for such combination. + */ + +#define __FILENAME__ (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) + +/** + * @brief submit command buffer + * + * @param rt_ctx runtime structure + * @param cvk_ctx kernel structure + */ +void test_submit_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx); + +/** + * @brief alloc tensor from device memory + * + * @param rt_ctx runtime structure + * @param cvk_ctx kernel structure + * @param shape tensor shape + * @param fmt tensor format such as \CVK_FMT_U16 or \CVK_FMT_U8 + * + * @return cvk_tg_t structure + */ +cvk_tg_t *test_alloc_tg_mem_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, + cvk_tg_shape_t shape, cvk_fmt_t fmt); + +/** + * @brief alloc matrix from device memory + * + * @param rt_ctx runtime structure + * @param shape matrix shape + * @param fmt tensor format such as \CVK_FMT_U16 or \CVK_FMT_U8 + * + * @return cvk_mg_t structure + */ +cvk_mg_t *test_alloc_mg_mem_comp(CVI_RT_HANDLE *rt_ctx, cvk_mg_shape_t shape, cvk_fmt_t fmt); + +/** + * @brief free tensor from device memory + * + * @param rt_ctx runtime structure + * @param tg pointer of tg + */ +void 
test_free_tg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_tg_t *tg); + +/** + * @brief free matrix from device memory + * + * @param rt_ctx runtime structure + * @param mg pointer of mg + */ +void test_free_mg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_mg_t *mg); + +/** + * @brief put host data to alloced tensor device memory + * + * @param rt_ctx runtime structure + * @param tg pointer of tg + * @param data[] host data + */ +void test_put_tg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_tg_t *tg, uint8_t data[]); + +/** + * @brief put host data to alloced matrix device memory + * + * @param rt_ctx runtime structure + * @param mg pointer of mg + * @param data[] host data + */ +void test_put_mg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_mg_t *mg, uint8_t data[]); + +/** + * @brief syntactic sugar for \test_alloc_mg_mem_comp -> \test_put_mg_mem_comp + * + * @param rt_ctx runtime structure + * @param mg_data_format mg format such as \CVK_FMT_U16 or \CVK_FMT_U8 + * @param data[] host data + * + * @return + */ +cvk_mg_t *test_put_matrix_g(CVI_RT_HANDLE *rt_ctx, const cvk_mg_shape_t shape, + cvk_fmt_t mg_data_format, uint8_t data[]); + +/** + * @brief get tensor data from device memory + * + * @param rt_ctx runtime structure + * @param tg pointer of tg + * + * @return data in device memory + */ +uint8_t *test_get_tg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_tg_t *tg); + +/** + * @brief get matrix data from device memory + * + * @param rt_ctx runtime structure + * @param mg pointer of mg + * + * @return data in device memory + */ +uint8_t *test_get_mg_mem_comp(CVI_RT_HANDLE *rt_ctx, const cvk_mg_t *mg); + +/** + * @brief get tensor data from tpu memory, + * the data path should be tpu memory -> device memory -> host memory + * + * @param rt_ctx runtime structure + * @param cvk_ctx kernel structure + * @param tl pointer of tl + * + * @return data in tpu memory + */ +uint8_t *test_get_tensor_l2g_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, + const cvk_tl_t *tl); + +/** + * 
@brief get matrix data from tpu memory, + * the data path should be tpu memory -> device memory -> host memory + * + * @param rt_ctx runtime structure + * @param cvk_ctx kernel structure + * @param ml pointer of ml + * + * @return data in tpu memory + */ +uint8_t *test_get_matrix_l2g_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, + const cvk_ml_t *ml); + +/** + * @brief put host data to tpu memory with tensor + * the data path should be host memory -> device memory -> tpu memory + * + * @param rt_ctx runtime structure + * @param cvk_ctx kernel structure + * @param tl pointer of tl + * @param data[] data in host memory + */ +void test_put_tensor_g2l_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, const cvk_tl_t *tl, + + uint8_t data[]); + +/** + * @brief put host data to tpu memory with matrix + * the data path should be host memory -> device memory -> tpu memory + * + * @param rt_ctx runtime structure + * @param cvk_ctx kernel structure + * @param ml pointer of ml + * @param data[] data in host memory + */ +void test_put_matrix_g2l_comp(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, const cvk_ml_t *ml, + uint8_t data[]); + +/** + * @brief alloc tensor from tpu memory + * + * @param cvk_ctx kernel structure + * @param shape shape of tensor + * @param fmt tensor format such as \CVK_FMT_U16 or \CVK_FMT_U8 + * @param eu_align is align excution unit + * + * @return pointer of tl + */ +cvk_tl_t *test_alloc_tl(cvk_context_t *cvk_ctx, cvk_tl_shape_t shape, cvk_fmt_t fmt, int eu_align); + +/** + * @brief free tpu memory with tensor + * + * @param cvk_ctx kernel structure + * @param tl pointer of tl + */ +void test_free_tl(cvk_context_t *cvk_ctx, const cvk_tl_t *tl); + + +/** + * @brief a small structure for getting RT memory information + */ +typedef struct _AddrInfo +{ + uint64_t phy_addr; + uint64_t size_bytes; + uint8_t *vir_addr; + int mem; +}AddrInfo; + +/** + * @brief get tpu global memory and assign info to an structure + * + * @param[in] bm_ctx runtime 
structure + * @param[out] pAddrInfo a structure for physical, virtual address + */ +uint8_t *test_get_vp_addr(bmctx_t *ctx, AddrInfo *pAddrInfo); + +/** + * @brief free tpu global memory from an info structure + * + * @param[in] bm_ctx runtime structure + * @param[in] pAddrInfo a structure for physical, virtual address + */ +void test_free_vp_addr(bmctx_t *ctx, AddrInfo *pAddrInfo); + + +/** + * @breif wrapper function + */ +// tensor in local functions +// get tl size +static inline uint64_t tl_shape_size(const cvk_tl_shape_t *s) { + return (uint64_t)s->n * s->c * s->h * s->w; +} + +static inline uint64_t tg_shape_size(const cvk_tg_shape_t *s) { + return (uint64_t)s->n * s->c * s->h * s->w; +} + +static inline uint64_t mg_shape_size(const cvk_mg_shape_t *s) { return (uint64_t)s->row * s->col; } + +static inline void free_tl(cvk_context_t *cvk_ctx, const cvk_tl_t *t) { + return cvk_ctx->ops->lmem_free_tensor(cvk_ctx, t); +} + +typedef struct { + cvk_fmt_t src_fmt; + cvk_fmt_t dst_fmt; +} cvk_fmt_type; + +static inline int bitsize_of_fmt(cvk_fmt_t fmt) { + switch (fmt) { + case CVK_FMT_F32: + case CVK_FMT_I32: + return 32; + case CVK_FMT_F16: + case CVK_FMT_I16: + case CVK_FMT_U16: + case CVK_FMT_BF16: + return 16; + case CVK_FMT_I8: + case CVK_FMT_U8: + return 8; + case CVK_FMT_I4: + return 4; + case CVK_FMT_I2: + return 2; + case CVK_FMT_I1: + return 1; + default: + assert(0); + return -1; + } +} +static inline int bytesize_of_fmt(cvk_fmt_t fmt) { return bitsize_of_fmt(fmt) / 8; } +static inline void tg_2_tl_shape(cvk_tl_shape_t *tl, cvk_tg_shape_t *tg) { + tl->n = tg->n; + tl->c = tg->c; + tl->h = tg->h; + tl->w = tg->w; +} + +static inline void tl_2_tg_shape(cvk_tg_shape_t *tg, cvk_tl_shape_t *tl) { + tg->n = tl->n; + tg->c = tl->c; + tg->h = tl->h; + tg->w = tl->w; +} +/** + * @brief init test case with runtime/kernel + * + * @param rt_ctx runtime structure + * @param cvk_ctx kernel structure + */ +// static inline void _test_init(CVI_RT_HANDLE ctx, 
cvk_context_t **cvk_ctx) { +// CVI_RT_HANDLE _ctx = (CVI_RT_HANDLE)ctx; +// int ret = CVI_RT_Init(&_ctx); +// if (ret != CVI_SUCCESS) { +// fprintf(stderr, "init failed, err %d\n", ret); +// exit(-1); +// } +// +// int alloc_size = 0x10000; +// *cvk_ctx = (cvk_context_t*) CVI_RT_RegisterKernel(_ctx, alloc_size); +// printf("alloc command buffer %d bytes success\n", alloc_size); +//} +// static inline void _test_exit(CVI_RT_HANDLE ctx, cvk_context_t *cvk_ctx) { +// CVI_RT_UnRegisterKernel(cvk_ctx); +// CVI_RT_HANDLE _ctx = (CVI_RT_HANDLE)ctx; +// CVI_RT_DeInit(_ctx); +//} + +static inline void test_init(CVI_RT_HANDLE *ctx, cvk_context_t **cvk_ctx) { + CVI_RT_HANDLE *_ctx = (CVI_RT_HANDLE *)ctx; + int ret = CVI_RT_Init(_ctx); + if (ret != CVI_SUCCESS) { + fprintf(stderr, "init failed, err %d\n", ret); + exit(-1); + } + + int alloc_size = 0x100000; + *cvk_ctx = (cvk_context_t *)CVI_RT_RegisterKernel(*_ctx, alloc_size); + printf("alloc command buffer %d bytes success\n", alloc_size); +} + +/** + * @brief de-init with runtime/kernel + * + * @param rt_ctx runtime structure + * @param cvk_ctx kernel structure + */ +static inline void test_exit(CVI_RT_HANDLE *ctx, cvk_context_t *cvk_ctx) { + CVI_RT_UnRegisterKernel(cvk_ctx); + CVI_RT_HANDLE *_ctx = (CVI_RT_HANDLE *)ctx; + CVI_RT_DeInit(*_ctx); +} + +// converter bf16<->int8 +uint8_t convert_bf16_u8(uint16_t data); +int8_t convert_bf16_s8(uint16_t data); +uint16_t convert_int8_bf16(uint8_t data, uint8_t sign); +uint32_t convert_fp32_u32(float fp32); +float convert_hex_fp32(uint32_t hval); +uint32_t convert_fp32_hex(float val); +float convert_bf16_fp32(uint16_t bf16); +uint16_t convert_fp32_bf16(float fp32); +int set_store_feround(); +void restore_feround(int round_mode); + +static inline void *xmalloc(size_t size) { + void *p = malloc(size); + if (!p) { + return NULL; + } + return p; +} + +#ifdef __cplusplus +} +#endif + +#endif // CVIMATH_TEST_UTIL_H diff --git a/cvimath/sample/CMakeLists.txt 
b/cvimath/sample/CMakeLists.txt new file mode 100644 index 000000000..bd5d55385 --- /dev/null +++ b/cvimath/sample/CMakeLists.txt @@ -0,0 +1,28 @@ +project(cvimath_sample) + +# wrapper source + +# include header +include_directories( + ${CMAKE_SOURCE_DIR}/include + ${TPU_SDK_ROOT}/include + ${TPU_SDK_ROOT}/include/cvimath + ) + +# add libs +set( TPU_KERNEL_LIB "-L${TPU_SDK_ROOT}/lib -lcvikernel") +set( TEST_LIBS cvimath cviruntime) + +file(GLOB CVI1835_SAMPLE ./*.cpp) + +foreach(SAMPLE_SRC ${CVI1835_SAMPLE}) + get_filename_component(SAMPLE_NAME ${SAMPLE_SRC} NAME_WE) + + add_executable(${SAMPLE_NAME} ${SAMPLE_UTIL} ${SAMPLE_SRC}) + target_link_libraries(${SAMPLE_NAME} ${TPU_KERNEL_LIB} ${TEST_LIBS}) + set_target_properties(${SAMPLE_NAME} PROPERTIES COMPILE_FLAGS "-Werror -Wall -Wextra") + install(TARGETS ${SAMPLE_NAME} DESTINATION bin) + + add_test(${SAMPLE_NAME} ${SAMPLE_NAME} ctest_test) + +endforeach() diff --git a/cvimath/sample/README.md b/cvimath/sample/README.md new file mode 100644 index 000000000..8afdc32cd --- /dev/null +++ b/cvimath/sample/README.md @@ -0,0 +1,21 @@ +# CVIMath + +## How to build + +### Requirements + +1. MLIR SDK + +SOC mode + +``` +$ mkdir build +$ cd build +$ cmake -G Ninja .. 
-DCMAKE_BUILD_WITH_INSTALL_RPATH=ON \ + -DTOOLCHAIN_ROOT_DIR=${PWD}/../../gcc-linaro-6.3.1-2017.05-x86_64_aarch64-linux-gnu \ + -DCMAKE_TOOLCHAIN_FILE=${PWD}/../toolchain-aarch64-linux.cmake \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_INSTALL_PREFIX= \ + -DTPU_SDK_ROOT= +$ ninja -j8 && ninja install +``` diff --git a/cvimath/sample/sample_bf16_fp32.cpp b/cvimath/sample/sample_bf16_fp32.cpp new file mode 100644 index 000000000..e4f0f9a64 --- /dev/null +++ b/cvimath/sample/sample_bf16_fp32.cpp @@ -0,0 +1,130 @@ +// \file mask sample for gt(great than), ge(great equal), eq(equal), lt(less than), le(less equal) + +// header include +#include +#include // math +#include // kerenl + +void init_input(uint16_t *input_data, uint64_t ifmap_size) { + for (uint64_t i = 0; i < ifmap_size; i++) { + input_data[i] = convert_fp32_bf16(i * 1.0); + } +} + +void init_ref(uint16_t *input_data, uint32_t *ref_data, uint64_t ifmap_size) { + union s { + uint16_t int16[2]; // big endian + uint32_t int32; + }; + union s _s; + for (uint64_t i = 0; i < ifmap_size; i++) { + _s.int16[0] = 0; + _s.int16[1] = input_data[i]; + ref_data[i] = _s.int32; + } +} + +static void testbench(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, + cvk_tg_shape_t *bf16_tg_shape) { + // for calculate size we need in host + cvk_tl_shape_t ifmap_shape = {bf16_tg_shape->n, bf16_tg_shape->c, bf16_tg_shape->h, + bf16_tg_shape->w}; + + // * 2 means fp32 takes twice size of bf16 + cvk_tl_shape_t ofmap_shape = {bf16_tg_shape->n, bf16_tg_shape->c, bf16_tg_shape->h, + bf16_tg_shape->w * 2}; + + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + + // unit size is 1 bytes, bf16 takes 2 bytes + int data_type_size = 2; + + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + + // * 2 means fp32 takes twice size of bf16 + uint64_t ofmap_bytesize = ofmap_size * data_type_size * 2; + + uint8_t *input_data = (uint8_t *)xmalloc(ifmap_bytesize); + uint8_t *ref_data = (uint8_t 
*)xmalloc(ofmap_bytesize); + + // init input / output data in ddr + init_input((uint16_t *)input_data, ifmap_size); + init_ref((uint16_t *)input_data, (uint32_t *)ref_data, ifmap_size); + + // send host memory->device memory + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tg_shape_t fp32_tg_shape; + fp32_tg_shape = {ofmap_shape.n, ofmap_shape.c, ofmap_shape.h, ofmap_shape.w}; + + cvk_tg_t *bf16_tg = test_alloc_tg_mem_comp(rt_ctx, cvk_ctx, *bf16_tg_shape, fmt); + test_put_tg_mem_comp(rt_ctx, bf16_tg, (uint8_t *)input_data); + + cvk_tg_t *fp32_tg = test_alloc_tg_mem_comp(rt_ctx, cvk_ctx, fp32_tg_shape, fmt); + + // prepare command buffer + cvm_bf16_fp32(cvk_ctx, bf16_tg, fp32_tg); + + // submit descriptor + test_submit_comp(rt_ctx, cvk_ctx); + + // get data from tl + uint8_t *ofmap_data = test_get_tg_mem_comp(rt_ctx, fp32_tg); + + // compare with reference with byte + for (uint32_t i = 0; i < ofmap_size; i++) { + if (ref_data[i] != ofmap_data[i]) { + fprintf(stderr, "comparing failed output[%u] got %u, ref %u\n", i, ofmap_data[i], + ref_data[i]); + // fail case + exit(-1); + } + } + + // free resource from tpu memory + test_free_tg_mem_comp(rt_ctx, bf16_tg); + test_free_tg_mem_comp(rt_ctx, fp32_tg); + + // free resource from host memory + free(input_data); + free(ref_data); + free(ofmap_data); +} + +int main() { + CVI_RT_HANDLE rt_ctx; + cvk_context_t *cvk_ctx; + int round_mode; + + // align kerenl rounding mode + round_mode = set_store_feround(); + + // init runtime / kerenl structure + test_init(&rt_ctx, &cvk_ctx); + + cvk_tg_shape_t bf16_tg_shape = {1, 2, 3, 4}; + { + // test 1 + printf("test bf16 <%d,%d,%d,%d> to fp32\n", bf16_tg_shape.n, bf16_tg_shape.c, bf16_tg_shape.h, + bf16_tg_shape.w); + testbench(&rt_ctx, cvk_ctx, &bf16_tg_shape); + printf("compare test bf16 to fp32 done\n"); + } + + { + // test 2 + bf16_tg_shape = {1, 20, 30, 40}; + printf("test bf16 <%d,%d,%d,%d> to fp32\n", bf16_tg_shape.n, bf16_tg_shape.c, bf16_tg_shape.h, + bf16_tg_shape.w); + 
testbench(&rt_ctx, cvk_ctx, &bf16_tg_shape); + printf("compare test bf16 to fp32 done\n"); + } + + // de-init runtime / kerenl structure + test_exit(&rt_ctx, cvk_ctx); + + // restore rounding mode + restore_feround(round_mode); + + return 0; +} diff --git a/cvimath/sample/sample_fp32_bf16.cpp b/cvimath/sample/sample_fp32_bf16.cpp new file mode 100644 index 000000000..5e335b8b1 --- /dev/null +++ b/cvimath/sample/sample_fp32_bf16.cpp @@ -0,0 +1,109 @@ +// \file mask sample for gt(great than), ge(great equal), eq(equal), lt(less than), le(less equal) + +// header include +#include +#include // math +#include // kerenl + +void init_input(uint32_t *input_data, uint64_t ifmap_size) { + for (uint64_t i = 0; i < ifmap_size; i++) { + input_data[i] = ((0x1234 + i) << 16) + 0x5678 + i; + } +} + +static void testbench(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, + cvk_tg_shape_t *fp32_tg_shape) { + // for calculate size we need in host + cvk_tl_shape_t ifmap_shape = {fp32_tg_shape->n, fp32_tg_shape->c, fp32_tg_shape->h, + fp32_tg_shape->w}; + + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + + // unit size is 1 bytes, bf16 takes 2 bytes + int data_type_size = 2; + + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + uint8_t *input_data = (uint8_t *)xmalloc(ifmap_bytesize); + uint64_t ifmap_bytesize_per_fp32 = ifmap_bytesize / 4; // 4 means float takes 4 bytes + + // init input / output data in ddr + init_input((uint32_t *)input_data, ifmap_bytesize_per_fp32); + + // send host memory->device memory + cvk_fmt_t fmt = CVK_FMT_BF16; + + cvk_tg_t *fp32_tg = test_alloc_tg_mem_comp(rt_ctx, cvk_ctx, *fp32_tg_shape, fmt); + test_put_tg_mem_comp(rt_ctx, fp32_tg, (uint8_t *)input_data); + + cvk_tg_shape_t bf16_tg_shape = *fp32_tg_shape; + cvk_tg_t *bf16_tg = test_alloc_tg_mem_comp(rt_ctx, cvk_ctx, bf16_tg_shape, fmt); + + // prepare command buffer + cvm_s2s_fp32_bf16(cvk_ctx, fp32_tg->start_address, fp32_tg->shape, bf16_tg->start_address, + bf16_tg->shape, fmt); + + // 
submit descriptor + test_submit_comp(rt_ctx, cvk_ctx); + + // get data from tl + uint8_t *ofmap_data = test_get_tg_mem_comp(rt_ctx, bf16_tg); + + // compare with reference with byte + uint16_t *ofmap_data_bf16 = (uint16_t *)ofmap_data; + uint32_t *input_data_i32 = (uint32_t *)input_data; + for (uint32_t i = 0; i < ifmap_bytesize_per_fp32; i++) { + uint16_t _input_data_i16 = (input_data_i32[i] >> 16) & 0xffff; + if (_input_data_i16 != ofmap_data_bf16[i]) { + fprintf(stderr, "comparing failed output[%u] got %u, ref %u\n", i, ofmap_data_bf16[i], + _input_data_i16); + // fail case + exit(-1); + } + } + + // free resource from tpu memory + test_free_tg_mem_comp(rt_ctx, bf16_tg); + test_free_tg_mem_comp(rt_ctx, fp32_tg); + + // free resource from host memory + free(input_data); + free(ofmap_data); +} + +int main() { + CVI_RT_HANDLE rt_ctx; + cvk_context_t *cvk_ctx; + int round_mode; + + // align kerenl rounding mode + round_mode = set_store_feround(); + + // init runtime / kerenl structure + test_init(&rt_ctx, &cvk_ctx); + + cvk_tg_shape_t fp32_tg_shape = {1, 2, 3, 4}; + { + // test 1 + printf("test fp32 <%d,%d,%d,%d> to bf16\n", fp32_tg_shape.n, fp32_tg_shape.c, fp32_tg_shape.h, + fp32_tg_shape.w); + testbench(&rt_ctx, cvk_ctx, &fp32_tg_shape); + printf("compare test bf16 to fp32 done\n"); + } + + { + // test 2 + fp32_tg_shape = {1, 20, 30, 40}; + printf("test fp32 <%d,%d,%d,%d> to bf16\n", fp32_tg_shape.n, fp32_tg_shape.c, fp32_tg_shape.h, + fp32_tg_shape.w); + testbench(&rt_ctx, cvk_ctx, &fp32_tg_shape); + printf("compare test bf16 to fp32 done\n"); + } + + // de-init runtime / kerenl structure + test_exit(&rt_ctx, cvk_ctx); + + // restore rounding mode + restore_feround(round_mode); + + return 0; +} diff --git a/cvimath/sample/sample_gemm.cpp b/cvimath/sample/sample_gemm.cpp new file mode 100644 index 000000000..56d515ccf --- /dev/null +++ b/cvimath/sample/sample_gemm.cpp @@ -0,0 +1,312 @@ +// \file sample for gemm(general matrix multiply) + +// header include 
+#include +#include // math +#include // kerenl + +#include // int gettimeofday +#include /* clock_t, clock, CLOCKS_PER_SEC */ + +typedef cvk_tiu_matrix_multiplication_param_t param_t; + +// comes from +// https://stackoverflow.com/questions/47023651/multiplying-matrices-in-one-dimensional-arrays +void multiply(uint16_t *a, int row1, int col1, uint16_t *b, int row2, int col2, uint16_t *d) { + assert(col1 == row2); + // silence error=unused-but-set-parameter warning + (void)row2; + + for (int i = 0; i < row1; i++) { + for (int j = 0; j < col2; j++) { + float sum = 0; + for (int k = 0; k < col1; k++) { + float _a = convert_bf16_fp32(a[i * col1 + k]); + float _b = convert_bf16_fp32(b[k * col2 + j]); + sum = sum + _a * _b; + } + d[i * col2 + j] = convert_fp32_bf16(sum); + } + } +} + +static void multiply_i32(uint8_t *a, int row1, int col1, uint8_t *b, int row2, int col2, + uint32_t *d, cvk_fmt_t fmt) { + assert(col1 == row2); + // silence error=unused-but-set-parameter warning + (void)row2; + + for (int i = 0; i < row1; i++) { + for (int j = 0; j < col2; j++) { + int sum = 0; + for (int k = 0; k < col1; k++) { + int _a = fmt == CVK_FMT_I8 ? (int8_t)(a[i * col1 + k]) : (a[i * col1 + k]); + int _b = fmt == CVK_FMT_I8 ? 
(int8_t)(b[k * col2 + j]) : (b[k * col2 + j]); + sum = sum + _a * _b; + } + d[i * col2 + j] = (sum); + } + } +} + +// compare with uint16_t type +int array_cmp_int16(const char *const info, const uint16_t *p_exp, const uint16_t *p_got, + int count) { + int idx; + for (idx = 0; idx < count; idx++) { + if (p_exp[idx] != p_got[idx]) { + printf("%s error at index %d exp %d(%f,0x%x) got %d(%f,0x%x)\n", info, idx, p_exp[idx], + convert_bf16_fp32(p_exp[idx]), p_exp[idx], p_got[idx], convert_bf16_fp32(p_got[idx]), + p_got[idx]); + return -1; + } + } + return 0; +} + +static int array_cmp_int32(const char *const info, const uint32_t *p_exp, const uint32_t *p_got, + int count) { + int idx; + for (idx = 0; idx < count; idx++) { + if (p_exp[idx] != p_got[idx]) { + printf("%s error at index %d exp %d got %d\n", info, idx, p_exp[idx], p_got[idx]); + return -1; + } + } + return 0; +} + +static cvk_mg_t *_test_put_matrix_g(CVI_RT_HANDLE *rt_ctx, size_t row, size_t col, + cvk_fmt_t mg_data_format, uint8_t data[]) { + cvk_mg_shape_t s; + s.row = row; + s.col = col; + return test_put_matrix_g(rt_ctx, s, mg_data_format, data); +} + +static void assign_bf16_values_to_matrix(uint16_t *matrix, size_t size) { + float t; + for (size_t i = 0; i < size; i++) { + float f; +#if 1 + // simple pattern + if (i % 2 == 0) t = i % 8; + if (i % 2 == 1) t = -1 * (i % 8); + f = t; +#else + t = i * (i % 2 ? 
-1 : 1); + f = t * 0.01 + size * 0.01; +#endif + matrix[i] = convert_fp32_bf16(f); + } +} + +static void assign_i8_values_to_matrix(uint8_t *matrix, size_t size) { + for (size_t i = 0; i < size; i++) { + matrix[i] = i + 20; + } +} + +static int test_gemm_bf16(size_t M, size_t N, size_t K) { + long elapsed; + struct timeval t0, t1; + int ret = 0; + + // alloc test data in host + uint16_t *bf16_A = new uint16_t[M * K]; + uint16_t *bf16_B = new uint16_t[N * K]; + uint16_t *bf16_R = new uint16_t[2 * M * N]; + uint16_t *int16_C_ref = new uint16_t[M * N]; + + // assign data + assign_bf16_values_to_matrix(bf16_A, M * K); + assign_bf16_values_to_matrix(bf16_B, N * K); + + gettimeofday(&t0, NULL); + + multiply(bf16_A, M, K, bf16_B, K, N, int16_C_ref); + + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + printf("CPU GEMM takes %ld us\n", elapsed); + + CVI_RT_HANDLE ctx; + cvk_context_t *bk_ctx; + + // init runtime / kerenl structure + test_init(&ctx, &bk_ctx); + + // alloc device memory and put data to device + cvk_mg_t *mg_A = _test_put_matrix_g(&ctx, M, K, CVK_FMT_BF16, (uint8_t *)bf16_A); + cvk_mg_t *mg_B = _test_put_matrix_g(&ctx, K, N, CVK_FMT_BF16, (uint8_t *)bf16_B); + cvk_mg_t *mg_R = _test_put_matrix_g(&ctx, M * 2, N, CVK_FMT_BF16, (uint8_t *)bf16_R); + + // get device address for gemm + gaddr_t gaddr_a = mg_A->start_address; + gaddr_t gaddr_b = mg_B->start_address; + gaddr_t gaddr_r = mg_R->start_address; + + // prepare gemm descriptor + size_t *slice_num = + cvm_gemm((cvk_context_t *)bk_ctx, gaddr_a, gaddr_b, gaddr_r, M, K, N, CVK_FMT_BF16); + + // submit descriptor + gettimeofday(&t0, NULL); + test_submit_comp(&ctx, bk_ctx); + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + + printf("GEMM takes %ld us\n", elapsed); + + // get result from device to host + uint16_t *bf16_ref = (uint16_t *)test_get_mg_mem_comp(&ctx, mg_R); + + // compare, exit once compare fail in 
+ int cmp_res = array_cmp_int16("gemm", int16_C_ref, bf16_ref, M * N); + if (cmp_res != 0) { + ret = -1; + printf("Comparison failed for cblas_sgemm and bmblas_gemm!\n"); + } + + // free device resource + test_free_mg_mem_comp(&ctx, mg_A); + test_free_mg_mem_comp(&ctx, mg_B); + test_free_mg_mem_comp(&ctx, mg_R); + + // de-init runtime / kerenl structure + test_exit(&ctx, bk_ctx); + + // free resource from host + delete[] bf16_A; + delete[] bf16_B; + delete[] bf16_R; + delete[] int16_C_ref; + free(bf16_ref); + free(slice_num); + + return ret; +} + +static int test_gemm_i8(size_t M, size_t N, size_t K, cvk_fmt_t fmt) { + long elapsed; + struct timeval t0, t1; + int ret = 0; + + // 4 means 32bit takes 4 times size of uint8_t + int uint32_per_uint8 = sizeof(uint32_t) / sizeof(uint8_t); + + // alloc test data in host + uint8_t *i8_A = new uint8_t[M * K]; + uint8_t *i8_B = new uint8_t[N * K]; + uint8_t *i8_R = new uint8_t[uint32_per_uint8 * M * N]; + uint32_t *int32_C_ref = new uint32_t[M * N]; + + // assign data + assign_i8_values_to_matrix(i8_A, M * K); + assign_i8_values_to_matrix(i8_B, N * K); + + // measure cpu time + gettimeofday(&t0, NULL); + + multiply_i32(i8_A, M, K, i8_B, K, N, int32_C_ref, fmt); + + gettimeofday(&t1, NULL); + + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + printf("CPU GEMM takes %ld us\n", elapsed); + + // alloc runtime + CVI_RT_HANDLE ctx; + cvk_context_t *bk_ctx; + + // init runtime / kerenl structure + test_init(&ctx, &bk_ctx); + + // alloc device memory and put data to device + cvk_mg_t *mg_A = _test_put_matrix_g(&ctx, M, K, CVK_FMT_I8, (uint8_t *)i8_A); + cvk_mg_t *mg_B = _test_put_matrix_g(&ctx, K, N, CVK_FMT_I8, (uint8_t *)i8_B); + cvk_mg_t *mg_R = _test_put_matrix_g(&ctx, M * uint32_per_uint8, N, CVK_FMT_I8, (uint8_t *)i8_R); + + // get device address for gemm + gaddr_t gaddr_a = mg_A->start_address; + gaddr_t gaddr_b = mg_B->start_address; + gaddr_t gaddr_r = mg_R->start_address; + + // prepare gemm 
descriptor + size_t *slice_num = cvm_gemm((cvk_context_t *)bk_ctx, gaddr_a, gaddr_b, gaddr_r, M, K, N, fmt); + + gettimeofday(&t0, NULL); + + // submit descriptor + test_submit_comp(&ctx, bk_ctx); + + gettimeofday(&t1, NULL); + + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + printf("GEMM takes %ld us\n", elapsed); + + // get result from device to host + uint8_t *i8_R_host = (uint8_t *)test_get_mg_mem_comp(&ctx, mg_R); + + // for re-combine + uint32_t *i32_C = new uint32_t[M * N]; + + if (fmt == CVK_FMT_I8) { + cvm_combin_gemm_i8(slice_num, i8_R_host, i32_C, M, N); + } + + free(slice_num); + + // compare, exit once compare fail in + int cmp_res = array_cmp_int32("gemm", int32_C_ref, i32_C, M * N); + if (cmp_res != 0) { + ret = -1; + printf("Comparison failed for cblas_sgemm and bmblas_gemm!\n"); + } + + // free device resource + test_free_mg_mem_comp(&ctx, mg_A); + test_free_mg_mem_comp(&ctx, mg_B); + test_free_mg_mem_comp(&ctx, mg_R); + + // de-init runtime / kerenl structure + test_exit(&ctx, bk_ctx); + + // free resource from host + delete[] i8_A; + delete[] i8_B; + delete[] i8_R; + delete[] int32_C_ref; + delete[] i32_C; + free(i8_R_host); + + return ret; +} + +static int test_gemm(size_t M, size_t N, size_t K, cvk_fmt_t fmt) { + printf("%s: M=%zu, N=%zu, K=%zu\n", __func__, M, N, K); + if (fmt == CVK_FMT_BF16) { + return test_gemm_bf16(M, N, K); + } else { + return test_gemm_i8(M, N, K, fmt); + } +} + +int main() { + int round_mode; + // align backend rounding + round_mode = set_store_feround(); + + if (0 != test_gemm(3, 500, 512, CVK_FMT_BF16)) exit(-1); + if (0 != test_gemm(1, 20000, 512, CVK_FMT_I8)) exit(-1); + + // heavy test + // if (0 != test_gemm(300, 500, 512, CVK_FMT_BF16)) exit(-1); + + printf("Comparison done for cpu gemm and tpu gemm!\n\n"); + + // restore rounding + restore_feround(round_mode); + + return 0; +} diff --git a/cvimath/sample/sample_mask.cpp b/cvimath/sample/sample_mask.cpp new file mode 100644 index 
000000000..fdf90e19d --- /dev/null +++ b/cvimath/sample/sample_mask.cpp @@ -0,0 +1,175 @@ +// \file mask sample for gt(great than), ge(great equal), eq(equal), lt(less than), le(less equal) + +// header include +#include +#include // math +#include // kerenl + +// global variable for loop all test case +static enum CVM_MASK_TYPE mode; + +// global structure for test +struct pattern { + float *input; // input + float *ref; // reference output + int len; // data lenth +#define HELP_LEN (10) + char help[HELP_LEN]; // help message +}; + +// input +float cvm_mask_type_gt_0_input[] = {-1 * pow(2, -62), -0.003, -1.0, -100000, 0.000001, 1, 1000, + pow(2, 62), 0}; + +// ref, 0 means false, 1 means true +float cvm_mask_type_gt_0_output[] = {0, 0, 0, 0, 1, 1, 1, 1, 0}; +float cvm_mask_type_ge_0_output[] = {0, 0, 0, 0, 1, 1, 1, 1, 1}; +float cvm_mask_type_eq_0_output[] = {0, 0, 0, 0, 0, 0, 0, 0, 1}; +float cvm_mask_type_lt_0_output[] = {1, 1, 1, 1, 0, 0, 0, 0, 0}; +float cvm_mask_type_le_0_output[] = {1, 1, 1, 1, 0, 0, 0, 0, 1}; + +// size of input +int input_sz = sizeof(cvm_mask_type_gt_0_input) / sizeof(cvm_mask_type_gt_0_input[0]); + +// init test case +static struct pattern patterns[] = { + {cvm_mask_type_gt_0_input, cvm_mask_type_gt_0_output, input_sz, "gt test"}, + {cvm_mask_type_gt_0_input, cvm_mask_type_ge_0_output, input_sz, "ge test"}, + {cvm_mask_type_gt_0_input, cvm_mask_type_eq_0_output, input_sz, "eq test"}, + {cvm_mask_type_gt_0_input, cvm_mask_type_lt_0_output, input_sz, "lt test"}, + {cvm_mask_type_gt_0_input, cvm_mask_type_le_0_output, input_sz, "le test"}, +}; + +static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) { + // default test bf16 case + cvk_fmt_t fmt = CVK_FMT_BF16; + + struct pattern *p = &patterns[mode]; + + // alloc shape, align with \len + uint32_t input_n = 1; + uint32_t input_c = 1; + uint32_t input_h = 1; + uint32_t input_w = p->len; + + cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + cvk_tl_shape_t ofmap_shape 
= ifmap_shape; + + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + + // unit size is 1 bytes, bf16 takes 2 bytes + int data_type_size = 1; + if (fmt == CVK_FMT_BF16) { + data_type_size = 2; + } + + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + + // get table shape + cvk_tl_shape_t table_shape; + uint64_t table_bytesize = cvm_lut_tbl_bytesize(bmk, &table_shape, fmt); + + // alloc input/output tl + cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, CTRL_AL); + cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, CTRL_AL); + + // alloc lookup table + cvk_tl_t *tl_pos_neg_buf = test_alloc_tl(bmk, table_shape, fmt, CTRL_AL); + cvk_tl_t *tl_0_idx_table = test_alloc_tl(bmk, table_shape, fmt, CTRL_AL); + + // alloc tmp tl + cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, CTRL_AL); + cvk_tl_t *tl_buf2 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, CTRL_AL); + cvk_tl_t *tl_buf4 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, CTRL_AL); + + // alloc data from ddr + uint16_t *input_data = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize); + uint16_t *table_data_atan_pos_neg = (uint16_t *)xmalloc(table_bytesize); + uint16_t *idx_0_table_data = (uint16_t *)xmalloc(table_bytesize); + + // init lookup table data in ddr + cvm_gen_0_tbl(idx_0_table_data, &table_shape); + cvm_pos_neg_tbl(table_data_atan_pos_neg, &table_shape); + + // init input / output data in ddr + for (uint32_t i = 0; i < ifmap_size; i++) { + input_data[i] = convert_fp32_bf16(p->input[i]); + ref_data[i] = convert_fp32_bf16(p->ref[i]); + } + + // send ddr data to tl + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)input_data); + test_put_tensor_g2l_comp(ctx, bmk, tl_pos_neg_buf, (uint8_t *)table_data_atan_pos_neg); + test_put_tensor_g2l_comp(ctx, bmk, tl_0_idx_table, (uint8_t *)idx_0_table_data); + + // emit mask 
function + cvm_emit_mask(bmk, + tl_ifmap, // input + tl_buf, tl_buf2, tl_buf4, // tmp buffer + tl_pos_neg_buf, tl_0_idx_table, // lookup table + tl_ofmap_bf16, // output + fmt, mode); + + // submit descriptor + test_submit_comp(ctx, bmk); + + // get data from tl + uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, tl_ofmap_bf16); + + // compare with reference + for (uint32_t i = 0; i < ifmap_size; i++) { + if (ref_data[i] != ofmap_data[i]) { + fprintf(stderr, "comparing failed at mode (%s) output[%u] got %f(0x%x), ref %f(0x%x)\n", + p->help, i, convert_bf16_fp32(ofmap_data[i]), ofmap_data[i], + convert_bf16_fp32(ref_data[i]), ref_data[i]); + // fail case + exit(-1); + } + } + + // free resource from kernel + free_tl(bmk, tl_buf4); + free_tl(bmk, tl_buf2); + free_tl(bmk, tl_buf); + free_tl(bmk, tl_0_idx_table); + free_tl(bmk, tl_pos_neg_buf); + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_ifmap); + + // free resource from heap + free(input_data); + free(ref_data); + free(ofmap_data); + free(table_data_atan_pos_neg); + free(idx_0_table_data); +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bmk; + int round_mode; + + // align kerenl rounding mode + round_mode = set_store_feround(); + + // init runtime / kerenl structure + test_init(&ctx, &bmk); + + for (int i = CVM_MASK_TYPE_GT_0; i < CVM_MASK_MAX; i++) { + mode = static_cast(i); + struct pattern *p = &patterns[mode]; + printf("test %s...\n", p->help); + testbench(&ctx, bmk); + } + + // de-init runtime / kerenl structure + test_exit(&ctx, bmk); + + // restore rounding mode + restore_feround(round_mode); + + return 0; +} diff --git a/cvimath/sample/sample_reduce_mul.cpp b/cvimath/sample/sample_reduce_mul.cpp new file mode 100644 index 000000000..5a31d14cc --- /dev/null +++ b/cvimath/sample/sample_reduce_mul.cpp @@ -0,0 +1,160 @@ +// \file mask sample for gt(great than), ge(great equal), eq(equal), lt(less than), le(less equal) + +// header include +#include +#include // math +#include // 
kerenl + +void init_input(uint8_t *input_data, uint64_t ifmap_bytesize, cvk_fmt_t fmt) { + uint32_t fmt_size = cvm_bytesize_of_fmt(fmt); + uint64_t sz = ifmap_bytesize / fmt_size; + int round = 4; // random + for (uint64_t i = 0; i < sz; i++) { + uint8_t r[2]; + r[0] = i % round; + if (r[0] == 0) { + r[0] = 1; // prevent mul to 0 + } + + if (fmt_size == 2) { + // bf16 + uint16_t bf16 = convert_fp32_bf16((float)r[0]); + memcpy(r, &bf16, fmt_size); + } + memcpy(&input_data[i * fmt_size], r, fmt_size); + } +} + +void init_ref(uint8_t *input_data, uint8_t *ref_data, cvk_tl_shape_t *ifmap_shape, cvk_fmt_t fmt) { + uint32_t fmt_size = cvm_bytesize_of_fmt(fmt); + int ref_idx = 0; + + // reduce ONLY hw + for (uint32_t n = 0; n < ifmap_shape->n; n++) { + for (uint32_t c = 0; c < ifmap_shape->c; c++) { + float tmp = 1; + for (uint32_t h = 0; h < ifmap_shape->h; h++) { + for (uint32_t w = 0; w < ifmap_shape->w; w++) { + uint32_t off = (n * ifmap_shape->c * ifmap_shape->h * ifmap_shape->w + + c * ifmap_shape->h * ifmap_shape->w + h * ifmap_shape->w + w) * + fmt_size; + float v; + if (fmt_size == 2) { + // bf16 case + uint16_t bf16; + memcpy(&bf16, &input_data[off], fmt_size); + v = convert_bf16_fp32(bf16); + } else { + v = input_data[off]; + } + tmp = v * tmp; + } + } + uint8_t r[2]; + if (fmt_size == 2) { + // bf16 case + uint16_t bf16 = convert_fp32_bf16(tmp); + memcpy(r, (void *)&bf16, fmt_size); + } else { + r[0] = tmp; + } + memcpy(&ref_data[ref_idx * fmt_size], r, fmt_size); + ref_idx++; + } + } +} + +static void testbench(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, cvk_fmt_t fmt) { + // alloc shape, align with \len + uint32_t input_n = 1; + uint32_t input_c = 3; + uint32_t input_h = 2; + uint32_t input_w = 2; + + cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + // NOTICE: ONLY reduce hw for performance + cvk_tl_shape_t ofmap_shape = {input_n, input_c, 1, 1}; + + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = 
tl_shape_size(&ofmap_shape); + + // unit size is 1 bytes, bf16 takes 2 bytes + int data_type_size = 1; + if (fmt == CVK_FMT_BF16) { + data_type_size = 2; + } + + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + + // alloc input/output tl + cvk_tl_t *tl_ifmap = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, CTRL_AL); + + // alloc data from ddr + uint8_t *input_data = (uint8_t *)xmalloc(ifmap_bytesize); + uint8_t *ref_data = (uint8_t *)xmalloc(ofmap_bytesize); + + // init input / output data in ddr + init_input(input_data, ifmap_bytesize, fmt); + init_ref(input_data, ref_data, &ifmap_shape, fmt); + + // send host memory->device memory->tpu_memory + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_ifmap, (uint8_t *)input_data); + + // prepare command buffer + cvm_reduce_hw_mul(cvk_ctx, tl_ifmap); + + // submit descriptor + test_submit_comp(rt_ctx, cvk_ctx); + + // reshape for reduce result + tl_ifmap->shape = {tl_ifmap->shape.n, tl_ifmap->shape.c, 1, 1}; + tl_ifmap->stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_ifmap->shape, tl_ifmap->fmt, 1); + + // get data from tl + uint8_t *ofmap_data = test_get_tensor_l2g_comp(rt_ctx, cvk_ctx, tl_ifmap); + + // compare with reference with byte + for (uint32_t i = 0; i < ofmap_size; i++) { + if (ref_data[i] != ofmap_data[i]) { + fprintf(stderr, "comparing failed output[%u] got %u, ref %u\n", i, ofmap_data[i], + ref_data[i]); + // fail case + exit(-1); + } + } + + // free resource from tpu memory + free_tl(cvk_ctx, tl_ifmap); + + // free resource from host memory + free(input_data); + free(ref_data); + free(ofmap_data); +} + +int main() { + CVI_RT_HANDLE rt_ctx; + cvk_context_t *cvk_ctx; + int round_mode; + + // align kerenl rounding mode + round_mode = set_store_feround(); + + // init runtime / kerenl structure + test_init(&rt_ctx, &cvk_ctx); + + printf("test reduce mul int8\n"); + testbench(&rt_ctx, cvk_ctx, CVK_FMT_I8); + + printf("test reduce mul bf16\n"); + 
testbench(&rt_ctx, cvk_ctx, CVK_FMT_BF16); + + // de-init runtime / kerenl structure + test_exit(&rt_ctx, cvk_ctx); + + // restore rounding mode + restore_feround(round_mode); + + return 0; +} diff --git a/cvimath/sample/sample_set_val_by_mask.cpp b/cvimath/sample/sample_set_val_by_mask.cpp new file mode 100644 index 000000000..2d5d1b94c --- /dev/null +++ b/cvimath/sample/sample_set_val_by_mask.cpp @@ -0,0 +1,656 @@ +// \file sample for set value by mask, plz refer \cvimath_internal.h for more details + +// header include +#include +#include // math +#include // kerenl + +#include // int gettimeofday +#include /* clock_t, clock, CLOCKS_PER_SEC */ + +#define DEBUG 1 // < 0 is disable debug +#define debug_print(fmt, ...) \ + do { \ + if (DEBUG) fprintf(stderr, fmt, __VA_ARGS__); \ + } while (0) + +int flip = 0; +struct testbench { + char *name; + int (*cvm_run)(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2, cvk_tl_t *tl_buf, + cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl, cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, + uint8_t threshold, uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap); + void (*ref)(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask, uint8_t *pNewY, uint8_t *pY, + uint8_t *g_update_tbl, uint8_t threshold, uint8_t w1, uint8_t w2); + uint8_t threshold; + uint8_t w1; + uint8_t w2; +}; + +static void init_kernel(uint8_t *kernel_data, uint64_t kernel_size, int8_t val) { + int8_t *kernel_data_i8 = (int8_t *)kernel_data; + for (uint64_t i = 0; i < kernel_size; i++) { + kernel_data_i8[i] = val; + } +} + +static void init_bias(uint8_t *bias_data, uint64_t bias_size, int16_t val) { + int c = bias_size / 2; + + for (int i = 0; i < c; i++) { + bias_data[i] = val & 0xff; + bias_data[i + c] = (val >> 8) & 0xff; + } +} + +static void init_input_2(uint8_t *input_data, uint64_t ifmap_size) { + for (uint64_t i = 0; i < ifmap_size; i++) { + input_data[i] = i * 2 * (i % 3 ? 
-1 : 1); + } +} + +static void init_input_3(uint8_t *input_data, uint64_t ifmap_size) { + for (uint64_t i = 0; i < ifmap_size; i++) { + input_data[i] = i * 3; + } +} + +static void init_mask(uint8_t *mask, uint64_t ifmap_size) { + for (uint64_t i = 0; i < ifmap_size; i++) { + mask[i] = i % 2; + } +} + +static void init_update_tbl(uint8_t *update_tbl, uint64_t ifmap_size) { + int8_t *update_tbl_i8 = (int8_t *)update_tbl; + for (uint64_t i = 0; i < ifmap_size; i++) { + update_tbl_i8[i] = i * (i % 2 ? -1 : 1); + } +} + +static void init_ref(uint8_t *ref_data, uint64_t ofmap_size) { + for (uint64_t i = 0; i < ofmap_size; i++) { + ref_data[i] = -1 * i; + // ref_data[i] = 3 * i; + } +} + +static void set_image_by_u8mask(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask, + uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl, + uint8_t threshold, uint8_t w1, uint8_t w2) { + (void)pY; + (void)g_update_tbl; + (void)threshold; + (void)w1; + (void)w2; + + for (size_t i = 0; i < ifmap_size; i++) { + if (mask[i]) { + ref_data[i] = pNewY[i]; + } + } +} + +static void set_image_by_two_info_i8(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask, + uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl, + uint8_t threshold, uint8_t w1, uint8_t w2) { + (void)pY; + (void)w1; + (void)w2; + int8_t *g_update_tbl_i8 = (int8_t *)g_update_tbl; + + for (size_t i = 0; i < ifmap_size; i++) { + if (mask[i] && (g_update_tbl_i8[i] < threshold)) { + ref_data[i] = pNewY[i]; + } + } +} + +static void gen_image_diff(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask, uint8_t *pNewY, + uint8_t *pY, uint8_t *g_update_tbl, uint8_t threshold, uint8_t w1, + uint8_t w2) { + (void)mask; + (void)w1; + (void)w2; + (void)g_update_tbl; + (void)threshold; + + for (size_t i = 0; i < ifmap_size; i++) { + ref_data[i] = abs(pNewY[i] - pY[i]); + } +} + +static void update_tbl_by_threshold(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask, + uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl, + uint8_t 
threshold, uint8_t w1, uint8_t w2) { + (void)pNewY; + (void)pY; + (void)g_update_tbl; + (void)mask; + (void)w2; + int8_t *ref_data_i8 = (int8_t *)ref_data; // output is i8 + + for (size_t i = 0; i < ifmap_size; i++) { + mask[i] = 0; + } + + for (size_t i = 0; i < ifmap_size; i++) { + int8_t old = ref_data_i8[i]; + if (g_update_tbl[i] < threshold) { + ref_data_i8[i] = (ref_data_i8[i] < w1) ? 0 : (ref_data_i8[i] - 1); + } else { + if (old != 127) { + // saturate it + ref_data_i8[i]++; + } + mask[i] = 1; + } + } +} + +static void set_image_by_two_info_u8(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask, + uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl, + uint8_t threshold, uint8_t w1, uint8_t w2) { + (void)pY; + (void)mask; + (void)w1; + (void)w2; + // int8_t* g_update_tbl_i8 = (int8_t*)g_update_tbl; + + for (size_t i = 0; i < ifmap_size; i++) { + if (g_update_tbl[i] >= threshold) { + ref_data[i] = pNewY[i]; + } + } +} + +static void blend_image_by_tbl(uint8_t *ref_data, uint64_t ifmap_size, uint8_t *mask, + uint8_t *pNewY, uint8_t *pY, uint8_t *g_update_tbl, + uint8_t threshold, uint8_t w1, uint8_t w2) { + (void)mask; + (void)pY; + int8_t *g_update_tbl_i8 = (int8_t *)g_update_tbl; + for (size_t i = 0; i < ifmap_size; i++) { + if (g_update_tbl_i8[i] > threshold) { + ref_data[i] = (w1 * ref_data[i] + w2 * pNewY[i]) >> 8; + } + } +} + +static int _cvm_set_image_by_u8mask(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2, + cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl, + cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, uint8_t threshold, + uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap) { + (void)tl_ifmap2; + (void)tl_update_tbl; + (void)threshold; + (void)w1; + (void)w2; + (void)tl_kernel; + (void)tl_bias; + (void)tl_buf; + + return cvm_set_image_by_u8mask(ctx, tl_ifmap, tl_buf, tl_mask, tl_ofmap); +} + +static int _cvm_set_image_by_u8mask_dp(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2, + cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, 
cvk_tl_t *tl_update_tbl, + cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, uint8_t threshold, + uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap) { + (void)tl_ifmap2; + (void)tl_update_tbl; + (void)threshold; + (void)w1; + (void)w2; + (void)tl_kernel; + (void)tl_bias; + (void)tl_buf; + + return cvm_set_image_by_u8mask_dp(ctx, tl_ifmap, tl_mask, tl_kernel, tl_bias, tl_ofmap); +} + +static int _cvm_set_image_by_two_info_i8(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, + cvk_tl_t *tl_ifmap2, cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, + cvk_tl_t *tl_update_tbl, cvk_tl_t *tl_kernel, + cvk_tl_t *tl_bias, uint8_t threshold, uint8_t w1, + uint8_t w2, cvk_tl_t *tl_ofmap) { + (void)tl_ifmap2; + (void)threshold; + (void)w1; + (void)w2; + (void)tl_kernel; + (void)tl_bias; + + // tl_ifmap2 as buf + return cvm_set_image_by_two_info_i8(ctx, tl_ifmap, tl_buf, tl_mask, tl_update_tbl, threshold, + tl_ofmap); +} + +static int _cvm_set_image_by_two_info_i8_dp(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, + cvk_tl_t *tl_ifmap2, cvk_tl_t *tl_buf, + cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl, + cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, + uint8_t threshold, uint8_t w1, uint8_t w2, + cvk_tl_t *tl_ofmap) { + (void)tl_ifmap2; + (void)threshold; + (void)w1; + (void)w2; + (void)tl_kernel; + (void)threshold; + (void)tl_buf; + + return cvm_set_image_by_two_info_i8_dp(ctx, tl_ifmap, tl_kernel, tl_mask, tl_update_tbl, tl_bias, + tl_ofmap); +} + +static int _cvm_gen_image_diff(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2, + cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl, + cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, uint8_t threshold, + uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap) { + (void)tl_mask; + (void)tl_buf; + (void)tl_update_tbl; + (void)threshold; + (void)w1; + (void)w2; + (void)tl_kernel; + (void)tl_bias; + + // tl_mask as buffer + return cvm_gen_image_diff(ctx, tl_ifmap, tl_ifmap2, tl_mask, tl_buf, tl_ofmap); +} + +static int _cvm_update_tbl_by_threshold(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, 
cvk_tl_t *tl_ifmap2, + cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, + cvk_tl_t *tl_update_tbl, cvk_tl_t *tl_kernel, + cvk_tl_t *tl_bias, uint8_t threshold, uint8_t w1, + uint8_t w2, cvk_tl_t *tl_ofmap) { + (void)w2; + (void)tl_kernel; + (void)tl_bias; + + // w1 as threshold_b, tl_ifmap/tl_ifmap2 as buf + return cvm_update_tbl_by_threshold(ctx, tl_mask, tl_ifmap, tl_ifmap2, tl_buf, tl_update_tbl, + threshold, w1, tl_ofmap); +} + +static int _cvm_set_image_by_two_info_u8(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, + cvk_tl_t *tl_ifmap2, cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, + cvk_tl_t *tl_update_tbl, cvk_tl_t *tl_kernel, + cvk_tl_t *tl_bias, uint8_t threshold, uint8_t w1, + uint8_t w2, cvk_tl_t *tl_ofmap) { + (void)tl_ifmap2; + (void)tl_mask; + (void)w1; + (void)w2; + (void)tl_kernel; + (void)tl_bias; + + // tl_ifmap2 as buf + return cvm_set_image_by_two_info_u8(ctx, tl_ifmap, tl_ifmap2, tl_buf, tl_update_tbl, threshold, + tl_ofmap); +} + +static int _cvm_blend_image_by_tbl(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2, + cvk_tl_t *tl_buf, cvk_tl_t *tl_mask, cvk_tl_t *tl_update_tbl, + cvk_tl_t *tl_kernel, cvk_tl_t *tl_bias, uint8_t threshold, + uint8_t w1, uint8_t w2, cvk_tl_t *tl_ofmap) { + (void)tl_ifmap2; + (void)tl_kernel; + (void)tl_bias; + // tl_mask as buf + return cvm_blend_image_by_tbl(ctx, tl_ifmap, tl_mask, tl_buf, tl_update_tbl, threshold, w1, w2, + tl_ofmap); +} + +struct testbench testbenchs[] = { + {(char *)"set_image_by_two_info_i8_dp", _cvm_set_image_by_two_info_i8_dp, + set_image_by_two_info_i8, 2, 2, 3}, + {(char *)"set_image_by_u8mask_dp", _cvm_set_image_by_u8mask_dp, set_image_by_u8mask, 10, 2, 3}, + + {(char *)"set_image_by_u8mask", _cvm_set_image_by_u8mask, set_image_by_u8mask, 10, 2, 3}, + {(char *)"set_image_by_two_info_i8", _cvm_set_image_by_two_info_i8, set_image_by_two_info_i8, 2, + 2, 3}, + {(char *)"update_tbl_by_threshold", _cvm_update_tbl_by_threshold, update_tbl_by_threshold, 15, + 12, 3}, + {(char *)"gen_image_diff", 
_cvm_gen_image_diff, gen_image_diff, 10, 2, 3}, + {(char *)"set_image_by_two_info_u8", _cvm_set_image_by_two_info_u8, set_image_by_two_info_u8, + 40, 2, 3}, + {(char *)"blend_image_by_tbl", _cvm_blend_image_by_tbl, blend_image_by_tbl, 6, 2, 3}, +}; + +static void load(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap2, + uint8_t *input_ifmap2, cvk_tl_t *tl_ifmap3, uint8_t *input_ifmap3, + cvk_tl_t *tl_ofmap, uint8_t *input_ofmap, cvk_tl_t *tl_mask, uint8_t *input_mask, + cvk_tl_t *tl_update_tbl, uint8_t *input_update_tbl) { + // send device memory to sram + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_ifmap2, input_ifmap2); + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_ifmap3, input_ifmap3); + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_mask, input_mask); + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_update_tbl, input_update_tbl); + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_ofmap, input_ofmap); +} + +static void store(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, char *name, cvk_tl_t *tl_ofmap, + uint8_t *output_ofmap, cvk_tl_t *tl_mask, uint8_t *output_mask, int sz) { + uint8_t *ofmap_data = (uint8_t *)test_get_tensor_l2g_comp(rt_ctx, cvk_ctx, tl_ofmap); + + // NOTICE: heavy copy + memcpy(output_ofmap, ofmap_data, sz); + + free(ofmap_data); + + if (!strcmp(name, "update_tbl_by_threshold")) { + uint8_t *mask_data = (uint8_t *)test_get_tensor_l2g_comp(rt_ctx, cvk_ctx, tl_mask); + memcpy(output_mask, mask_data, sz); + free(mask_data); + } +} + +static void testbench(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, cvk_tg_shape_t *tg_shape, + int testcase_idx, int is_pingpong = false) { + // for calculate size we need in host + cvk_tl_shape_t ifmap_shape = {tg_shape->n, tg_shape->c, tg_shape->h, tg_shape->w}; + cvk_tl_shape_t ofmap_shape = ifmap_shape; + + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + + // unit size is 1 bytes + int data_type_size = 1; + + // get input/output size + 
uint64_t ifmap_bytesize = ifmap_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + + // alloc on ddr + // uint8_t *input_data = (uint8_t *)xmalloc(ifmap_bytesize); + uint8_t *input_data2 = (uint8_t *)xmalloc(ifmap_bytesize); + uint8_t *input_data3 = (uint8_t *)xmalloc(ifmap_bytesize); + uint8_t *mask = (uint8_t *)xmalloc(ifmap_bytesize); + uint8_t *update_tbl = (uint8_t *)xmalloc(ifmap_bytesize); + uint8_t *_ref_data = (uint8_t *)xmalloc(ofmap_bytesize); + uint8_t *ref_data = (uint8_t *)xmalloc(ofmap_bytesize); + uint8_t *tpu_output_data = (uint8_t *)xmalloc(ofmap_bytesize); + uint8_t *tpu_output_mask = (uint8_t *)xmalloc(ofmap_bytesize); + + // init input / output data in ddr + uint8_t threshold, w1, w2; + threshold = testbenchs[testcase_idx].threshold; + w1 = testbenchs[testcase_idx].w1; + w2 = testbenchs[testcase_idx].w2; + init_input_2(input_data2, ifmap_size); + init_input_3(input_data3, ifmap_size); + // init_input(input_data2, ifmap_size); + // init_input(input_data3, ifmap_size); + init_mask(mask, ifmap_size); + init_update_tbl(update_tbl, ifmap_size); + init_ref(ref_data, ofmap_size); + + // keep org output + memcpy(_ref_data, ref_data, ofmap_bytesize); + + testbenchs[testcase_idx].ref(ref_data, ofmap_size, mask, input_data2, input_data3, update_tbl, + threshold, w1, w2); + + int tiles = std::ceil(ifmap_shape.c / (float)cvk_ctx->info.npu_num); + + ifmap_shape.c = ifmap_shape.c / tiles; + + cvk_tl_shape_t kernel_shape = ifmap_shape; + kernel_shape.h = 1; + kernel_shape.w = 1; + + cvk_tl_shape_t bias_shape = ifmap_shape; + bias_shape.h = 1; + bias_shape.w = 1; + bias_shape.n = 2; + + uint64_t kernel_size = tl_shape_size(&kernel_shape); + uint64_t bias_size = tl_shape_size(&bias_shape); + uint64_t kernel_bytesize = kernel_size * data_type_size; + uint64_t bias_bytesize = bias_size * data_type_size; + uint8_t *kernel_data = (uint8_t *)xmalloc(kernel_bytesize); + uint8_t *bias_data = (uint8_t *)xmalloc(bias_bytesize); + + // 
NOTICE: must init with it + init_kernel(kernel_data, kernel_size, -1); + init_bias(bias_data, bias_size, 1); + + if (!strcmp(testbenchs[testcase_idx].name, "set_image_by_two_info_i8_dp")) { + init_kernel(kernel_data, kernel_size, 1); + init_bias(bias_data, bias_size, -1 * threshold); + } + + if (is_pingpong) { + // quirk that we tile h for easy implemenetation + ifmap_shape.h /= 2; + tiles *= 2; + } + + // sync input/output + ofmap_shape = ifmap_shape; + + // NOTICE: dont care batch + int shape_sz = ifmap_shape.c * ifmap_shape.h * ifmap_shape.w; + + // alloc on sram, just once + cvk_fmt_t fmt = CVK_FMT_U8; // for mac used + int eu_align = 1; // dont care + cvk_tl_t *tl_ifmap2[2] = {NULL, NULL}; + cvk_tl_t *tl_ifmap3[2] = {NULL, NULL}; + cvk_tl_t *tl_ofmap[2] = {NULL, NULL}; + cvk_tl_t *tl_mask[2] = {NULL, NULL}; + cvk_tl_t *tl_update_tbl[2] = {NULL, NULL}; + // must place last for high part of 'mac' + cvk_tl_t *tl_buf[2] = {NULL, NULL}; + cvk_tl_t *tl_kernel, *tl_bias; + + // alloc sram + tl_kernel = test_alloc_tl(cvk_ctx, kernel_shape, CVK_FMT_I8, eu_align); + tl_bias = test_alloc_tl(cvk_ctx, bias_shape, CVK_FMT_I8, /*eu_align=*/0); + + int alloc_nr = is_pingpong ? 
2 : 1; + for (int i = 0; i < alloc_nr; i++) { + tl_ifmap2[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align); + tl_ifmap3[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align); + tl_ofmap[i] = test_alloc_tl(cvk_ctx, ofmap_shape, fmt, eu_align); + tl_mask[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align); + tl_update_tbl[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align); + // must place last for high part of 'mac' + tl_buf[i] = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align); + } + + // NOTICE: consider residual + int load_offset = 0; + int store_offset = 0; + int ret; + int curr = flip; + long elapsed; + struct timeval t0, t1; + gettimeofday(&t0, NULL); + + if (!is_pingpong) { + int off = 0; + for (int i = 0; i < tiles; i++) { + // NOTICE: load each loop + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_kernel, kernel_data); + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_bias, bias_data); + + load(rt_ctx, cvk_ctx, tl_ifmap2[curr], input_data2 + off, tl_ifmap3[curr], input_data3 + off, + tl_ofmap[curr], _ref_data + off, tl_mask[curr], mask + off, tl_update_tbl[curr], + update_tbl + off); + + int ret = testbenchs[testcase_idx].cvm_run( + cvk_ctx, tl_ifmap2[curr], tl_ifmap3[curr], tl_buf[curr], tl_mask[curr], + tl_update_tbl[curr], tl_kernel, tl_bias, threshold, w1, w2, tl_ofmap[curr]); + + if (ret) { + fflush(stderr); + printf("%s", "generate commands fail, return\n"); + exit(-1); + } + + store(rt_ctx, cvk_ctx, testbenchs[testcase_idx].name, tl_ofmap[curr], tpu_output_data + off, + tl_mask[curr], tpu_output_mask + off, shape_sz); + + off += shape_sz; + } + } else { + // TODO: not load at once + int operand_num = 1; + int input_flip = 0; + int output_flip = 0; + for (int i = 0; i < tiles + 2; i++) { + cvk_ctx->ops->parallel_enable(cvk_ctx); + // NOTICE: load each loop + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_kernel, kernel_data); + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_bias, bias_data); + + // send device memory to sram + if ((i - 2) >= 
0 && (i - 2) % operand_num == operand_num - 1) { + int curr = 1 - output_flip; + store(rt_ctx, cvk_ctx, testbenchs[testcase_idx].name, tl_ofmap[curr], + tpu_output_data + store_offset, tl_mask[curr], tpu_output_mask + store_offset, + shape_sz); + store_offset += shape_sz; + } + + if (i - 1 >= 0 && i - 1 < tiles) { + // get data from tl + int curr = 1 - input_flip; + // prepare command buffer + ret = testbenchs[testcase_idx].cvm_run( + cvk_ctx, tl_ifmap2[curr], tl_ifmap3[curr], tl_buf[curr], tl_mask[curr], + tl_update_tbl[curr], tl_kernel, tl_bias, threshold, w1, w2, tl_ofmap[curr]); + + if (ret) { + fflush(stderr); + printf("%s", "generate commands fail, return\n"); + exit(-1); + } + output_flip = 1 - output_flip; + } + + if (i < tiles) { + load(rt_ctx, cvk_ctx, tl_ifmap2[input_flip], input_data2 + load_offset, + tl_ifmap3[input_flip], input_data3 + load_offset, tl_ofmap[input_flip], + _ref_data + load_offset, tl_mask[input_flip], mask + load_offset, + tl_update_tbl[input_flip], update_tbl + load_offset); + load_offset += shape_sz; + input_flip = 1 - input_flip; + } + cvk_ctx->ops->parallel_disable(cvk_ctx); + } + } + + // submit descriptor + test_submit_comp(rt_ctx, cvk_ctx); + + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + + // compare with reference with byte + debug_print("%s comparing...", testbenchs[testcase_idx].name); + for (uint32_t i = 0; i < (uint32_t)ofmap_bytesize; i++) { + if (ref_data[i] != tpu_output_data[i]) { + debug_print("comparing failed output[%u] got %u, ref %u\n", i, tpu_output_data[i], + ref_data[i]); + // fail case + fflush(stderr); + exit(-1); + } + } + + // compare another export information + if (!strcmp(testbenchs[testcase_idx].name, "update_tbl_by_threshold")) { + for (uint32_t i = 0; i < (uint32_t)shape_sz; i++) { + if (mask[i] != tpu_output_mask[i]) { + debug_print("comparing mask failed output[%u] got %u, ref %u\n", i, tpu_output_mask[i], + mask[i]); + // fail case + 
fflush(stderr); + exit(-1); + } + } + } + + if (tiles == 1) { + debug_print("%s", " pass\n"); + } else { + // get elapsed time + debug_print("(takes %ld us)\n", elapsed); + } + + // free resource from tpu memory + for (int i = alloc_nr - 1; i >= 0; --i) { + free_tl(cvk_ctx, tl_buf[i]); + free_tl(cvk_ctx, tl_update_tbl[i]); + free_tl(cvk_ctx, tl_mask[i]); + free_tl(cvk_ctx, tl_ofmap[i]); + free_tl(cvk_ctx, tl_ifmap3[i]); + free_tl(cvk_ctx, tl_ifmap2[i]); + } + free_tl(cvk_ctx, tl_bias); + free_tl(cvk_ctx, tl_kernel); + + // free resource from host memory + // free(input_data); + free(ref_data); + free(tpu_output_data); + free(tpu_output_mask); + free(input_data2); + free(input_data3); + free(mask); + free(update_tbl); + free(_ref_data); + free(kernel_data); + free(bias_data); +} + +int main() { + CVI_RT_HANDLE rt_ctx; + cvk_context_t *cvk_ctx; + + // init runtime / kerenl structure + test_init(&rt_ctx, &cvk_ctx); + + cvk_tg_shape_t tg_shape = {1, 20, 3, 4}; + + // run test + int testbench_nr = sizeof(testbenchs) / sizeof(testbenchs[0]); + + for (int i = 0; i < testbench_nr; i++) { + testbench(&rt_ctx, cvk_ctx, &tg_shape, i); + } +#if 1 + + // run test without ping-pong + tg_shape = {1, 128, 340, 16}; + + printf("[heavy data] w/o ping pong\n"); + + // NOTICE: only check c + int tiles = std::ceil(tg_shape.c / (float)cvk_ctx->info.npu_num); + if (tg_shape.c > cvk_ctx->info.npu_num) { + debug_print("tile nr %d channel base one npu nr %d\n", tiles, cvk_ctx->info.npu_num); + } + + for (int i = 0; i < testbench_nr; i++) { + testbench(&rt_ctx, cvk_ctx, &tg_shape, i); + } + + tg_shape = {1, 128, 340, 16}; + printf("[heavy data] w/ ping pong\n"); + for (int i = 0; i < testbench_nr; i++) { + testbench(&rt_ctx, cvk_ctx, &tg_shape, i, /*is_pingpong=*/true); + } +#endif + // de-init runtime / kerenl structure + test_exit(&rt_ctx, cvk_ctx); + + printf("all pass\n"); + + return 0; +} diff --git a/cvimath/sample/sample_sigmoid_linear_interp.cpp 
b/cvimath/sample/sample_sigmoid_linear_interp.cpp new file mode 100644 index 000000000..74db96cf8 --- /dev/null +++ b/cvimath/sample/sample_sigmoid_linear_interp.cpp @@ -0,0 +1,165 @@ +// \file implement activation function(sigmoid) by interpolation lookup table, +// please refer [here](https://en.wikipedia.org/wiki/Linear_interpolation) for more details + +// header include +#include +#include // math +#include // kerenl + +// ========== user config ============ +#define MAX_ERROR (0.004) // tolerance +// for current example, we quauntize data to -8 ~ +8 +// range depend on ur activation +static int range_start = -8; +static int range_end = 8; +// ========== end of user config ============ + +// gen reference by cpu +static double _gen_sigmoid(float x) { return 1.0 / (1.0 + exp(-(x))); } + +// gen reference +static void gen_ref(uint16_t *ofmap, uint16_t *ifmap, cvk_tl_shape_t ifmap_shape) { + for (uint64_t i = 0; i < tl_shape_size(&ifmap_shape); i++) { + ofmap[i] = convert_fp32_bf16(_gen_sigmoid(convert_bf16_fp32(ifmap[i]))); + } +} + +// verify cpu data with tpu +static bool verify(uint16_t *ofmap_data, uint16_t *ref_data, uint64_t ofmap_size) { + int count = 0; + uint64_t size = ofmap_size; + + for (uint64_t i = 0; i < size; i++) { + float got = convert_bf16_fp32(ofmap_data[i]); + float exp = convert_bf16_fp32(ref_data[i]); + + if (fabs(got - exp) > MAX_ERROR) { + fprintf(stderr, + "[%d] comparing failed at ofmap_data[%u], got %x, exp %x, " + "diff(%f - %f) is %f\n", + count, (uint32_t)i, ofmap_data[i], ref_data[i], got, exp, fabs(got - exp)); + count++; + } + } + + // exit if fail + if (count != 0) { + printf("error count is %d\n", count); + exit(-1); + } + + return true; +} + +// gen random input for test +static void gen_input(uint16_t *ifmap, uint64_t ifmap_size) { + int table_hw = 256; + for (uint64_t i = 0; i < ifmap_size; i++) { + // input range is -8 ~ +8 + float input = ((int)i % 7) * (((int)i % 2) ? 
1 : -1) + 0.03 + (i % table_hw) * 0.002; + ifmap[i] = convert_fp32_bf16(input); + } +} + +// main code for test sigmoid interpolate implement by lookup table +static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) { + // example for input tensor + cvk_tl_shape_t ifmap_shape = {1, 32, 16, 16}; + cvk_fmt_t fmt = CVK_FMT_BF16; + + // get table / input shape + cvk_tl_shape_t table_shape; + cvm_table_shape(bmk, &table_shape); + cvk_tl_shape_t ofmap_shape = ifmap_shape; + + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t table_size = tl_shape_size(&table_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + + // get table/input size + int data_type_size = 1; + if (fmt == CVK_FMT_BF16) { + // bf16 takes 2 bytes + data_type_size = 2; + } + + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + uint64_t table_bytesize = table_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + + // alloc host memory + uint16_t *ifmap = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *table_data = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_slope = (uint16_t *)xmalloc(table_bytesize); + uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize); + + // gen input and assign data in host + gen_input(ifmap, ifmap_size); + + // gen table, interpolation need 2 tables, one for lookup, another one is slope + cvm_sigmoid_tbl(table_data, table_data_slope, &table_shape, range_start, range_end); + + // gen reference + gen_ref(ref_data, ifmap, ofmap_shape); + + // alloc input / output / tmp / lookup table / slope table + cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *cvk_tl_table_answer = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *cvk_tl_table_answer_slope = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + + // device 
memory load to local memory + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)ifmap); + test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer, (uint8_t *)table_data); + test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer_slope, (uint8_t *)table_data_slope); + + // get quantize(scale) value + float scale = cvm_sigmoid_scale(range_start, range_end); + + // emit core function + cvm_emit_sigmoid(bmk, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_slope, + tl_ofmap_bf16, scale); + + // get result from device to host + uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, tl_ofmap_bf16); + + // verify data with tolerance + verify(ofmap_data, ref_data, ofmap_size); + + // release device memory in revert order + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_buf); + free_tl(bmk, cvk_tl_table_answer_slope); + free_tl(bmk, cvk_tl_table_answer); + free_tl(bmk, tl_ifmap); + + // release host memory + free(ifmap); + free(table_data); + free(table_data_slope); + free(ref_data); + free(ofmap_data); +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bmk; + int round_mode; + + round_mode = set_store_feround(); + + // init runtime / kerenl structure + test_init(&ctx, &bmk); + + // emit test case + testbench(&ctx, bmk); + + // de-init runtime / kerenl structure + test_exit(&ctx, bmk); + + // restore rounding + restore_feround(round_mode); + + return 0; +} diff --git a/cvimath/sample/sample_upsample.cpp b/cvimath/sample/sample_upsample.cpp new file mode 100644 index 000000000..c101967da --- /dev/null +++ b/cvimath/sample/sample_upsample.cpp @@ -0,0 +1,145 @@ +// \file sample for set value by mask, plz refer \cvimath_internal.h for more details + +// header include +#include +#include // math +#include // kerenl + +static void init_input(uint8_t *input_data, uint64_t ifmap_size) { + for (uint64_t i = 0; i < ifmap_size; i++) { + input_data[i] = i; + } +} + +static void init_weight(uint8_t *weight_data, uint64_t weight_size) { + for (uint64_t 
// CPU reference for nearest-neighbor 2-D upsampling on NCHW uint8 data.
// Every output pixel (y, x) copies input pixel (y / scale_h, x / scale_w).
// Always returns 0.
static int init_ref(uint8_t *input, uint8_t *output, int n, int c, int ih, int iw, int scale_h,
                    int scale_w) {
  const int oh = ih * scale_h;
  const int ow = iw * scale_w;

  // n and c only select the plane, so walk them as one fused index.
  for (int plane = 0; plane < n * c; plane++) {
    const uint8_t *src = input + (size_t)plane * ih * iw;
    uint8_t *dst = output + (size_t)plane * oh * ow;
    for (int y = 0; y < oh; y++) {
      const uint8_t *src_row = src + (size_t)(y / scale_h) * iw;
      for (int x = 0; x < ow; x++) {
        dst[(size_t)y * ow + x] = src_row[x / scale_w];
      }
    }
  }
  return 0;
}
*)xmalloc(weight_bytesize); + + // init input / output data in ddr + init_input(input_data, ifmap_size); + init_weight(weight_data, weight_bytesize); // fix pattern + init_ref(input_data, ref_data, ifmap_shape.n, ifmap_shape.c, ifmap_shape.h, ifmap_shape.w, + scale_h, scale_w); + + // alloc on sram + cvk_fmt_t fmt = CVK_FMT_I8; + int eu_align = 1; + cvk_tl_t *tl_ifmap = test_alloc_tl(cvk_ctx, ifmap_shape, fmt, eu_align); + cvk_tl_t *tl_weight = test_alloc_tl(cvk_ctx, weight_shape, fmt, eu_align); + cvk_tl_t *tl_ofmap = test_alloc_tl(cvk_ctx, ofmap_shape, fmt, eu_align); + + // send device memory to sram + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_ifmap, input_data); + test_put_tensor_g2l_comp(rt_ctx, cvk_ctx, tl_weight, weight_data); + + // generate descriptor + cvm_upsample2d(cvk_ctx, tl_ifmap, tl_weight, tl_ofmap); + + // submit descriptor + test_submit_comp(rt_ctx, cvk_ctx); + + // get data from tl + uint8_t *ofmap_data = (uint8_t *)test_get_tensor_l2g_comp(rt_ctx, cvk_ctx, tl_ofmap); + + // compare with reference with byte + for (uint32_t i = 0; i < ofmap_size; i++) { + if (ref_data[i] != ofmap_data[i]) { + fprintf(stderr, "comparing failed output[%u] got %u, ref %u\n", i, ofmap_data[i], + ref_data[i]); + // fail case + fflush(stderr); + exit(-1); + } + } + + // free resource from tpu memory + free_tl(cvk_ctx, tl_ofmap); + free_tl(cvk_ctx, tl_weight); + free_tl(cvk_ctx, tl_ifmap); + + // free resource from host memory + free(ref_data); + free(weight_data); + free(ofmap_data); + free(input_data); +} + +int main() { + CVI_RT_HANDLE rt_ctx; + cvk_context_t *cvk_ctx; + + // init runtime / kerenl structure + test_init(&rt_ctx, &cvk_ctx); + + cvk_tg_shape_t tg_shape = {1, 20, 3, 4}; + // cvk_tg_shape_t tg_shape = {1, 20, 3, 40}; + + // run test + testbench(&rt_ctx, cvk_ctx, &tg_shape); + + // de-init runtime / kerenl structure + test_exit(&rt_ctx, cvk_ctx); + + printf("pass\n"); + + return 0; +} diff --git a/cvimath/src/1880v2_fp_convert.c 
b/cvimath/src/1880v2_fp_convert.c new file mode 100644 index 000000000..f750f11df --- /dev/null +++ b/cvimath/src/1880v2_fp_convert.c @@ -0,0 +1,293 @@ +#ifndef ATOMIC_FP_H_ +#define ATOMIC_FP_H_ + +#if __arm__ +#define __DISABLE_FENV__ +#endif + +#ifndef __DISABLE_FENV__ +#include +#endif +#include +#include // uint8_t / uint16_t + +#ifdef __cplusplus +extern "C" { +#endif + +uint8_t convert_bf16_u8(uint16_t data); +uint8_t _convert_bf16_u8(uint16_t data, int int8_rnd_md); +int8_t _convert_bf16_s8(uint16_t data, int int8_rnd_md); +int8_t convert_bf16_s8(uint16_t data); +uint16_t convert_int8_bf16(uint8_t data, uint8_t sign); +uint32_t convert_fp32_u32(float fp32); +uint32_t convert_fp32_hex(float val); +float convert_hex_fp32(uint32_t hval); + +float convert_bf16_fp32(uint16_t bf16); +uint16_t convert_fp32_bf16(float fp32); + +void f32_integer(void *if32, void *o_integer, int integer_size, int accumulate, int int8_signed, + int int8_rnd_md); +// void f32_integer(void *if32, void *o_integer, +// 0 for 32 bit , 1 for 16 bit , 2 for 8 bit +// int integer_size, int accumulate = 0, int int8_signed = 1, int int8_rnd_md = 0); + +union convert_type_float { + float fval; + uint16_t bf16[2]; + uint32_t ival; +}; + +typedef union convert_type_float convert_int_float; +static const uint16_t NAN_VALUE = 0x7FC0; + +// static int round_mode = 0; +static uint8_t float_isnan(const float x) { + // return isnan(x); + return x != x; +} + +int set_store_feround() { +#ifndef __DISABLE_FENV__ + int round_mode = fegetround(); + fesetround(FE_TOWARDZERO); + return round_mode; +#else + return 0; +#endif +} + +void restore_feround(int round_mode) { +#ifndef __DISABLE_FENV__ + fesetround(round_mode); +#else + (void)round_mode; +#endif +} + +uint8_t _convert_bf16_u8(uint16_t data, int int8_rnd_md) { + /* convert bf16 to float32*/ + float fp32; + convert_int_float convert_val; + fp32 = convert_bf16_fp32(data); + /* convert float32 to uint8_t*/ + f32_integer((void *)&fp32, &convert_val.ival, 2, 
// Convert an IEEE-754 float to bf16 with round-to-nearest-even on the
// dropped 16 mantissa bits. NaN maps to the canonical quiet-NaN pattern
// 0x7FC0; any result whose exponent field saturates (inf, or overflow
// produced by the rounding add) is squashed to 0x7f7f to mirror the
// hardware's behavior.
uint16_t convert_fp32_bf16(float fp32) {
  if (fp32 != fp32) { /* NaN check without <math.h> */
    return 0x7FC0;    /* NAN_VALUE */
  }

  union {
    float f;
    uint32_t u;
  } bits;
  bits.f = fp32;

  /* Round to nearest, ties to even: bias is 0x7fff plus the keep-bit LSB. */
  const uint32_t keep_lsb = (bits.u >> 16) & 1u;
  uint16_t bf16 = (uint16_t)((bits.u + 0x7fffu + keep_lsb) >> 16);

  /* HW behavior: saturated exponent field becomes 0x7f7f. */
  if ((bf16 & 0x7f80) == 0x7f80) {
    bf16 = 0x7f7f;
  }
  return bf16;
}
// Split the value encoded by float x into an integral part and the float
// remainder: on return *integer_part + *sub_part reconstructs x for
// in-range inputs. When the exponent exceeds the 32-bit integer range
// (level > 30, which also catches inf/NaN exponent fields) the integer
// part saturates: 0x7fffffff (INT32_MAX) normally, or 0x80000000
// (|INT32_MIN|) when `sign` says the caller is handling a negative value.
// NOTE(review): callers appear to pass the magnitude in x and carry the
// sign separately in `sign` -- confirm against call sites.
void flt2int_flt(float x, unsigned long long *integer_part, float *sub_part, uint8_t sign) {
  union {
    float fval;
    uint32_t ival;
  } work_x;
  work_x.fval = x;

  /* Unbiased exponent of x. */
  int level_code = (int)((work_x.ival >> 23) & 0xff) - 127;

  if (level_code < 0) {
    /* |x| < 1: the integer part is zero, everything is remainder. */
    *integer_part = 0;
    *sub_part = x;
    return;
  }

  /* Mantissa with the implicit leading 1 restored. */
  unsigned long long tail_code = (work_x.ival & 0x7fffffu) | 0x800000u;

  if (level_code < 23) {
    /* Some mantissa bits are fractional: shift them out for the integer
     * part, mask them off in the float to recover the remainder. */
    *integer_part = tail_code >> (23 - level_code);
    work_x.ival &= 0xffffffffu << (23 - level_code);
    *sub_part = x - work_x.fval;
  } else if (level_code <= 30) {
    /* Whole mantissa is integral and the result still fits 32 bits. */
    *integer_part = tail_code << (level_code - 23);
    *sub_part = 0;
  } else {
    /* Out of 32-bit range: saturate. Bug fixes: the negative clamp was
     * written 0x800000000 (extra zero -- a 36-bit constant) instead of
     * the intended 0x80000000, and the old code performed the left shift
     * before this range check, which was undefined behavior for very
     * large exponents. */
    *integer_part = sign ? 0x80000000ull : 0x7fffffffull;
    *sub_part = 0;
  }
}
-ifval : ifval; + double sub_part; + double integer; + unsigned long long integer_part; + // uint8_t sign = !isPositive; + // flt2int_flt(abs_fval, &integer_part, &sub_part, sign); + sub_part = modf((double)abs_fval, &integer); + integer_part = (unsigned long long)integer; + if (!isPositive) { + unsigned long long result; + if (int8_rnd_md == 0) { // round to nearest even + if (sub_part > 0.5f) { + result = integer_part + 1; + } else if (sub_part == 0.5f) { + if (integer_part & 0x1) { + result = integer_part + 1; + } else { + result = integer_part; + } + } else { + result = integer_part; + } + } else { // round to zero + result = integer_part; + } + if (result > 0x80000000UL) { + result = 0x80000000UL; + } + return -result; + } else { + unsigned long long result; + if (int8_rnd_md == 0) { // round to nearest even + if (sub_part > 0.5f) { + result = integer_part + 1; + } else if (sub_part == 0.5f) { + if (integer_part & 0x1) { + result = integer_part + 1; + } else { + result = integer_part; + } + } else { + result = integer_part; + } + } else { + result = integer_part; + } + if (result > 0x7fffffff) { + result = 0x7fffffff; + } + return result; + } +} + +void f32_integer(void *if32, void *o_integer, int integer_size, int accumulate, int int8_signed, + int int8_rnd_md) { + int i_tmp; + float *f_tmp; + f_tmp = (float *)if32; + i_tmp = flt2int(*f_tmp, int8_rnd_md); + int *o32 = (int *)o_integer; + int dst_f32 = *o32; + short *o16 = (short *)o_integer; + short dst_o16 = *o32; + char *o8 = (char *)o_integer; + char dst_o8 = *o8; + + if (integer_size == 0) { + *o32 = i_tmp; + } else if (integer_size == 1) { + *o16 = i_tmp; + } else { + *o8 = i_tmp; + int min = (int8_signed) ? -128 : 0; + int max = (int8_signed) ? 
# Build libcvimath as both a shared and a static library from the same
# explicit source list, linked against the TPU kernel library chosen by
# the top-level build (${TPU_KERNEL_LIB}).
project(cvimath)

# Explicit list instead of file(GLOB): globs silently miss files added
# after configure time and hide source additions from diffs.
set(CVIMATH_SRC
    1880v2_fp_convert.c
    bf16_gemm.c
    blas_cpu.cpp
    chl_quan.cpp
    common.c
    fp32_bf16_kernel.c
    set_val_by_mask.c
    tiu_lut_atan.c
    tiu_lut_atan2.c
    tiu_reciprocal.c
    tiu_reshape_c.c
    tiu_sigmoid.c
    tiu_sqrt.c
    tiu_upsample.c
    util.c
)

add_library(${PROJECT_NAME} SHARED ${CVIMATH_SRC})
# CMAKE_SOURCE_DIR (not PROJECT_SOURCE_DIR) on purpose: the public headers
# live in <top>/include, one level above this directory.
target_include_directories(${PROJECT_NAME} PUBLIC ${CMAKE_SOURCE_DIR}/include)
target_link_libraries(${PROJECT_NAME} PUBLIC ${TPU_KERNEL_LIB})
install(TARGETS ${PROJECT_NAME} DESTINATION lib)

add_library(${PROJECT_NAME}-static STATIC ${CVIMATH_SRC})
target_include_directories(${PROJECT_NAME}-static PUBLIC ${CMAKE_SOURCE_DIR}/include)
target_link_libraries(${PROJECT_NAME}-static PUBLIC ${TPU_KERNEL_LIB})
install(TARGETS ${PROJECT_NAME}-static DESTINATION lib)
2 : 1; + + cvk_ml_t *t; + if (pre) { + t = pre; + } else { + t = xmalloc(sizeof(*t)); + } + + t->start_address = la; + t->fmt = fmt; + t->shape = s; + t->stride.h = s.w * val; + if (eu_align) + t->stride.c = align_up(s.w * val, eu_num); + else + t->stride.c = s.w * val; + t->stride.n = t->stride.c * ceiling_func(s.c, npu_num); + + uint32_t needed = align_up(t->shape.n * t->stride.n, eu_num); + + if (lmem_size - lmem_ptr < needed) { + if (!pre) { + free(t); + } + ASSERT(0 && "not enough local memory alloc"); + return NULL; + } + // ctx->lmem_ptr += needed; + lmem_ptr = la + 1; + return t; +} + +void bmk1880v2_lmem_free_prealloc_bf16_matrix(cvk_context_t *ctx, bool is_pre_alloc, + const cvk_ml_t *t) { + // printf("free from %d, lmem_ptr is %d\n", t->start_address, ctx->lmem_ptr); + (void)ctx; + ASSERT(t->start_address < lmem_ptr); + lmem_ptr = t->start_address; + if (!is_pre_alloc) { + free((void *)t); + } +} + +static void tdma_store_stride_bf16(cvk_context_t *ctx, cvk_ml_t *tlp, uint64_t ga_dst, + cvk_mg_stride_t ts_stride, ctrl_t ctrl) { + bool DoTranspose = (ctrl & CTRL_TP) ? true : false; + bool isNeuron = (ctrl & CTRL_NEURON) ? true : false; + + ASSERT(DoTranspose == false); + (void)DoTranspose; + + // tensor in system memory + // Global shape use local shape + // Global shape used for stride calculation + cvk_mg_t ts_data; + ts_data.base_reg_index = isNeuron ? NEURON_MEMORY : WEIGHT_MEMORY; + ts_data.start_address = ga_dst; + ts_data.fmt = tlp->fmt; + ts_data.shape.row = tlp->shape.n; + ts_data.shape.col = tlp->shape.col; + ts_data.stride = ts_stride; + + cvk_tdma_l2g_matrix_copy_param_t p1; + p1.src = tlp; + p1.dst = &ts_data; + ctx->ops->tdma_l2g_bf16_matrix_copy(ctx, &p1); +} + +static void tdma_load_stride_bf16(cvk_context_t *ctx, cvk_ml_t *tlp, uint64_t ga_src, + cvk_mg_stride_t ts_stride, ctrl_t ctrl) { + ASSERT(tlp != NULL); + + bool DoTranspose = (ctrl & CTRL_TP) ? true : false; + bool isNeuron = (ctrl & CTRL_NEURON) ? 
true : false; + (void)DoTranspose; + + // Global memory from reshaped local memory + cvk_mg_t ts_data; + ts_data.base_reg_index = isNeuron ? NEURON_MEMORY : WEIGHT_MEMORY; + ts_data.start_address = ga_src; + ts_data.fmt = tlp->fmt; + ts_data.shape.row = tlp->shape.n; + ts_data.shape.col = tlp->shape.col; + ts_data.stride = ts_stride; + + // BM1880v2 tdma does not support transposed matrix load + ASSERT(!DoTranspose); + + cvk_tdma_g2l_matrix_copy_param_t p1; + p1.src = &ts_data; + p1.dst = tlp; + ctx->ops->tdma_g2l_bf16_matrix_copy(ctx, &p1); +} +// +// Shape/stride used in TDMA may not the same as in TIU. +// Adjust shape/stride for TIU. +// +// E.g. +// Y(0, 4) = L(1, 256) * R(256, 4) + B(1, 4) +// +// TDMA: +// L(0, 16, 1, 16) +// R(255, 1, 1, 4) +// B(0, 1, 1, 4) +// +// TIU: +// Y res0(1, 1, 1, 16) +// L opd0(1, 16, 1, 16) +// R opd1(256, 1, 1, 16) +// B opd2(1, 1, 1, 16) +// +static void matrix_multiplication(cvk_context_t *ctx, cvk_tiu_matrix_multiplication_param_t *p) { + // No need to adjust shape/stride + if (p->res->shape.w >= ctx->info.eu_num) { + // LLVM_DEBUG(printf(" L(%d, %d), R(%d, %d)\n", p->left->shape.n, + // p->left->shape.col, p->right->shape.n, + // p->right->shape.col);); + ctx->ops->tiu_matrix_multiplication(ctx, p); + + return; + } + + // + // New shape/stride to align ctx->info.eu_num + // adjust w as ctx->info.eu_num + // + cvk_ml_t tl_res; + tl_res.start_address = p->res->start_address; + tl_res.fmt = p->res->fmt; + tl_res.shape.n = p->res->shape.n; + tl_res.shape.c = p->res->shape.c; + tl_res.shape.w = (uint32_t)(ctx->info.eu_num); + tl_res.shape.col = p->res->shape.col; + tl_res.stride = ctx->ops->ml_default_stride(ctx, tl_res.shape, CVK_FMT_BF16, /*eu_align=*/1); + + cvk_ml_t tl_right; + tl_right.start_address = p->right->start_address; + tl_right.fmt = p->right->fmt; + tl_right.shape.n = p->right->shape.n; + tl_right.shape.c = p->right->shape.c; + tl_right.shape.w = (uint32_t)(ctx->info.eu_num); + tl_right.shape.col = 
p->right->shape.col; + tl_right.stride = ctx->ops->ml_default_stride(ctx, tl_right.shape, CVK_FMT_BF16, /*eu_align=*/1); + + cvk_ml_t tl_bias = {0}; + if (p->bias) { + tl_bias.start_address = p->bias->start_address; + tl_bias.fmt = p->bias->fmt; + tl_bias.shape.n = p->bias->shape.n; + tl_bias.shape.c = p->bias->shape.c; + tl_bias.shape.w = (uint32_t)(ctx->info.eu_num); + tl_bias.shape.col = p->bias->shape.col; + tl_bias.stride = ctx->ops->ml_default_stride(ctx, tl_bias.shape, CVK_FMT_BF16, /*eu_align=*/1); + } + + cvk_tiu_matrix_multiplication_param_t p2; + // copy p to p2 + p2.res = p->res; + p2.left = p->left; + p2.right = p->right; + p2.bias = p->bias; + p2.lshift_bits = p->lshift_bits; + p2.rshift_bits = p->rshift_bits; + p2.res_is_int8 = p->res_is_int8; + p2.add_result = p->add_result; + p2.relu_enable = p->relu_enable; + p2.ps32_mode = p->ps32_mode; + p2.res_is_int8 = p->res_is_int8; + + p2.layer_id = p->layer_id; + // p2.sw_op_info = p->sw_op_info; + + p2.res = &tl_res; + p2.left = p->left; + p2.right = &tl_right; + p2.bias = p->bias ? 
&tl_bias : NULL; + + LLVM_DEBUG(printf(" Modified L(%d, %d), R(%d, %d)\n", p2.left->shape.n, p2.left->shape.col, + p2.right->shape.n, p2.right->shape.col);); + + ctx->ops->tiu_matrix_multiplication(ctx, &p2); +} + +static void fc_slicing_multi_dimention(cvk_context_t *ctx, uint32_t layer_id, + gaddr_t global_offset_bottom_data, + gaddr_t global_offset_weight_data, + gaddr_t global_offset_bias_data, + gaddr_t global_offset_top_data, int input_row_num, + int input_col_num, int weight_col_num, int have_bias, + int do_activation, int activation_method) { + // Y(M, K) = L(M, K) * R(K, N) + B(1, N) + uint32_t M = (uint32_t)(input_row_num); + uint32_t K = (uint32_t)(input_col_num); + uint32_t N = (uint32_t)(weight_col_num); + + LLVM_DEBUG(printf("fc_slicing_multi_dimension\n" + " Y(%d, %d) = L(%d, %d) * R(%d, %d) + B(%d, %d)\n", + M, N, M, K, K, N, 1, N);); + + // Split N <= max total eu number + uint32_t total_eu = ctx->info.npu_num * ctx->info.eu_num; + uint32_t tiled_N = (N >= total_eu) ? total_eu : N; + + // Split K based on lane size + uint32_t lane_size = ctx->info.lmem_size; + uint32_t max_k = (1 << 12) - 1; // 1880v2: 12 bit + uint32_t tiled_K = (K >= max_k) ? max_k : K; + + // Tiled Y + cvk_ml_t tl_tiled_Y = {0}; + tl_tiled_Y.fmt = CVK_FMT_BF16; + + // Tiled L + cvk_ml_t tl_tiled_L = {0}; + tl_tiled_L.fmt = CVK_FMT_BF16; + + // Tiled R + cvk_ml_t tl_tiled_R = {0}; + tl_tiled_R.fmt = CVK_FMT_BF16; + + // Tiled B + cvk_ml_t tl_tiled_B = {0}; + if (have_bias) { + // ctx->ops->tiu_matrix_multiplication will change shape.n from 2 to 1 + // So we use the shape for both dma load and local memory allocation. 
+ + // Upper16 [31:16] then Lower16 [15:0] separated by b_stride + tl_tiled_B.fmt = CVK_FMT_BF16; + tl_tiled_B.shape = ctx->ops->ml_default_shape(ctx, sizeof(uint32_t) / sizeof(uint16_t), tiled_N, + CVK_FMT_BF16); // 2 x 16bit + tl_tiled_B.stride = + ctx->ops->ml_default_stride(ctx, tl_tiled_B.shape, CVK_FMT_BF16, /*eu_align=*/1); + } + + // Tiled local memory layout: + // Y at fixed position since last tiled ones may be smaller + // + // tiled Y, [7:0] + // tiled Y, [15:8] + // tiled Y, [23:16] + // tiled Y, [31:24] + // tiled L [15:0] + // tiled R [15:0] + // tiled B, [31:16], if existed + // tiled B, [15:0], if existed + + // Find max tiled K + uint32_t required_size = 0; + do { + required_size = 0; // Start of LMEM + + // Not split M since we don't want to reload L(weight) + // or reload partial result of different M. + // + // Y(M, N) = L(M, K) * R(K, N) + B(1, N) + // tiled_Y(M, tiled_N) = tiled_L(M, tiled_K) * tiled_R(tiled_K, tiled_N) + tiled_B(1, tiled_N) + + // tiled Y, 2 * 16bit + tl_tiled_Y.start_address = required_size; + tl_tiled_Y.shape = ctx->ops->ml_default_shape(ctx, M, tiled_N, CVK_FMT_BF16); + tl_tiled_Y.stride = + ctx->ops->ml_default_stride(ctx, tl_tiled_Y.shape, CVK_FMT_BF16, /*eu_align=*/1); + required_size += ctx->ops->lmem_ps32_matrix_to_size(ctx, tl_tiled_Y.shape, CVK_FMT_BF16, + /*eu_align=*/1); + + // tiled L, 16bit + tl_tiled_L.start_address = required_size; + tl_tiled_L.shape = ctx->ops->ml_default_shape(ctx, M, tiled_K, CVK_FMT_BF16); + tl_tiled_L.stride = + ctx->ops->ml_default_stride(ctx, tl_tiled_L.shape, CVK_FMT_BF16, /*eu_align=*/1); + required_size += + ctx->ops->lmem_matrix_to_size(ctx, tl_tiled_L.shape, CVK_FMT_BF16, /*eu_align=*/1); + + // tiled R, 16bit + tl_tiled_R.start_address = required_size; + tl_tiled_R.shape = ctx->ops->ml_default_shape(ctx, tiled_K, tiled_N, CVK_FMT_BF16); + tl_tiled_R.stride = + ctx->ops->ml_default_stride(ctx, tl_tiled_R.shape, CVK_FMT_BF16, /*eu_align=*/1); + required_size += + 
ctx->ops->lmem_matrix_to_size(ctx, tl_tiled_R.shape, CVK_FMT_BF16, /*eu_align=*/1); + + // tiled B, 2 * 16bit + if (have_bias) { + tl_tiled_B.start_address = required_size; + required_size += + ctx->ops->lmem_matrix_to_size(ctx, tl_tiled_B.shape, CVK_FMT_BF16, /*eu_align=*/1); + } + + if (required_size <= lane_size) { + // LLVM_DEBUG(printf(" tiled_Y %d, tiled_L %d, tiled_R %d, tiled_B %d, required_size %d\n", + // ctx->ops->lmem_ps32_matrix_to_size(ctx, tl_tiled_Y.shape, + // CVK_FMT_BF16, /*eu_align=*/1), + // ctx->ops->lmem_matrix_to_size(ctx, tl_tiled_L.shape, + // CVK_FMT_BF16, /*eu_align=*/1), + // ctx->ops->lmem_matrix_to_size(ctx, tl_tiled_R.shape, + // CVK_FMT_BF16, /*eu_align=*/1), + // ctx->ops->lmem_matrix_to_size(ctx, tl_tiled_B.shape, + // CVK_FMT_BF16, /*eu_align=*/1), required_size);); + + break; + } + + } while (--tiled_K); + + LLVM_DEBUG(printf(" tiled_Y(%d, %d) = tiled_L(%d, %d) * tiled_R(%d, %d) + tiled_B(%d, %d)," + " required_size %d kB\n", + M, tiled_N, M, tiled_K, tiled_K, tiled_N, 1, tiled_N, required_size / 1024);); + + LLVM_DEBUG( + printf(" tiled_Y shape (n=%d, c=%d, w=%d, col=%d), stride(n=%d, c=%d, h=%d)\n" + " tiled_L shape (n=%d, c=%d, w=%d, col=%d), stride(n=%d, c=%d, h=%d)\n" + " tiled_R shape (n=%d, c=%d, w=%d, col=%d), stride(n=%d, c=%d, h=%d)\n" + " tiled_B shape (n=%d, c=%d, w=%d, col=%d), stride(n=%d, c=%d, h=%d)\n", + tl_tiled_Y.shape.n, tl_tiled_Y.shape.c, tl_tiled_Y.shape.w, tl_tiled_Y.shape.col, + tl_tiled_Y.stride.n, tl_tiled_Y.stride.c, tl_tiled_Y.stride.h, tl_tiled_L.shape.n, + tl_tiled_L.shape.c, tl_tiled_L.shape.w, tl_tiled_L.shape.col, tl_tiled_L.stride.n, + tl_tiled_L.stride.c, tl_tiled_L.stride.h, tl_tiled_R.shape.n, tl_tiled_R.shape.c, + tl_tiled_R.shape.w, tl_tiled_R.shape.col, tl_tiled_R.stride.n, tl_tiled_R.stride.c, + tl_tiled_R.stride.h, tl_tiled_B.shape.n, tl_tiled_B.shape.c, tl_tiled_B.shape.w, + tl_tiled_B.shape.col, tl_tiled_B.stride.n, tl_tiled_B.stride.c, tl_tiled_B.stride.h);); + + ASSERT(tiled_K); + 
if (!tiled_K) { + return; + } + + // Each tiled_R(weight) is only loaded once. + // tiled_L(input) reload is reload once tiled_weight moves right. + // + // for each tiled N + for (uint32_t offset_N = 0; offset_N < N; offset_N += tiled_N) { + // Y = [Y0, Y1, ... Yn-1] + + // Actual width + uint32_t width_N = ((offset_N + tiled_N) <= N) ? tiled_N : (N - offset_N); + + // for each tiled K + for (uint32_t offset_K = 0; offset_K < K; offset_K += tiled_K) { + // Y(M, K) = L(M, K) * R(K, N) + B(1, N) + // tiled_Y(M, tiled_K) = tiled_L(M, tiled_K) * tiled_R(tiled_K, tiled_N) + tiled_B(1, tiled_N) + // + // L = [L0, L1, ... Lk-1] + // R = [R0,0, R0,1, ..., R0,n-1 + // R1,0, + // + // Rk-1,0, Rk-1,1, ..., Rk-1,n-1] + // B = [B0, B1, ... Bn-1] + // + // tiled_y,i += L0 * R0,i + L1 * R1,i + ... + Ln-1 * Rk-1,i + Bi + + // Actual width + uint32_t width_K = ((offset_K + tiled_K) <= K) ? tiled_K : (K - offset_K); + + required_size = 0; // Start of LMEM + + // tiled Y, 32bit + tl_tiled_Y.start_address = required_size; + tl_tiled_Y.shape = ctx->ops->ml_default_shape(ctx, M, width_N, CVK_FMT_BF16); // actual width + required_size += ctx->ops->lmem_ps32_matrix_to_size(ctx, tl_tiled_Y.shape, CVK_FMT_BF16, + /*eu_align=*/1); + + // Load tiled L from global memory, input + tl_tiled_L.start_address = required_size; + tl_tiled_L.shape = ctx->ops->ml_default_shape(ctx, M, width_K, CVK_FMT_BF16); // actual width + tl_tiled_L.stride = ctx->ops->ml_default_stride(ctx, tl_tiled_L.shape, CVK_FMT_BF16, + /*eu_align=*/1); + required_size += + ctx->ops->lmem_matrix_to_size(ctx, tl_tiled_L.shape, CVK_FMT_BF16, /*eu_align=*/1); + cvk_mg_stride_t ts_stride; + ts_stride.row = K * sizeof(uint16_t); + tdma_load_stride_bf16(ctx, &tl_tiled_L, + global_offset_bottom_data + offset_K * sizeof(uint16_t), + ts_stride, // original column width + CTRL_NEURON); + + // Load tiled R from global memory, weight + tl_tiled_R.start_address = required_size; + tl_tiled_R.shape = + ctx->ops->ml_default_shape(ctx, 
width_K, width_N, CVK_FMT_BF16); // actual width + tl_tiled_R.stride = ctx->ops->ml_default_stride(ctx, tl_tiled_R.shape, CVK_FMT_BF16, + /*eu_align=*/1); + required_size += + ctx->ops->lmem_matrix_to_size(ctx, tl_tiled_R.shape, CVK_FMT_BF16, /*eu_aligned=*/1); + + ts_stride.row = N * sizeof(uint16_t); + tdma_load_stride_bf16( + ctx, &tl_tiled_R, + global_offset_weight_data + (offset_K * N + offset_N) * sizeof(uint16_t), + ts_stride, // original column width + CTRL_NEURON); + + // Load tiled B(bias) from gobale memory at last time as H/W does + // we need temporary shape to load uppper 16bit and lower 16bit + bool is_last_tile = ((offset_K + tiled_K) >= K) ? true : false; + bool B_needed = (is_last_tile && have_bias) ? true : false; + if (B_needed) { + tl_tiled_B.start_address = required_size; + + tl_tiled_B.shape = + ctx->ops->ml_default_shape(ctx, sizeof(uint32_t) / sizeof(uint16_t), width_N, + CVK_FMT_BF16); // 2 x 16bit, actual width + tl_tiled_B.stride = + ctx->ops->ml_default_stride(ctx, tl_tiled_B.shape, CVK_FMT_BF16, /*eu_align=*/1); + required_size += ctx->ops->lmem_matrix_to_size(ctx, tl_tiled_B.shape, CVK_FMT_BF16, + /*eu_aligned=*/1); + ASSERT(required_size <= lane_size); + + ts_stride.row = N * sizeof(uint16_t); + tdma_load_stride_bf16(ctx, &tl_tiled_B, + global_offset_bias_data + offset_N * sizeof(uint16_t), + ts_stride, // original column width + CTRL_NEURON); + } + + uint32_t ps32_mode = 0; // normal mode + uint32_t relu_enable = 0; // 1880v2 relu can be used in ps32_mode + if (tiled_K < K) { + if (offset_K == 0) { // first tile + ps32_mode = 2; // write 32b result at the first time + } else if (is_last_tile) { // last tile + ps32_mode = 1; // load previous 32-bit result + } else { + ps32_mode = 3; // init & write 32bits partial sum + } + } + + // No tiling or last tile + if ((ps32_mode == 0 || ps32_mode == 1) && do_activation && activation_method == RELU) { + relu_enable = 1; + } + + { + cvk_tiu_matrix_multiplication_param_t p; + p.res = 
&tl_tiled_Y; + p.left = &tl_tiled_L; + p.right = &tl_tiled_R; + p.bias = B_needed ? &tl_tiled_B : NULL; + p.lshift_bits = 0; // deprecated + p.rshift_bits = 0; + p.res_is_int8 = 1; // H/W constraint + p.add_result = 0; // H/W constraint + p.relu_enable = relu_enable; + p.ps32_mode = ps32_mode; + p.res_is_int8 = 1; + + p.layer_id = layer_id; + // p.sw_op_info = offset_N; + + LLVM_DEBUG(printf(" [offset_N=%d][offset_K=%d] L(%d, %d), R(%d, %d)\n", offset_N, offset_K, + p.left->shape.n, p.left->shape.col, p.right->shape.n, + p.right->shape.col);); + + matrix_multiplication(ctx, &p); + } + + // Store tiled_Y to global memory + if (is_last_tile) { + ts_stride.row = N * sizeof(uint16_t); + tdma_store_stride_bf16(ctx, &tl_tiled_Y, + global_offset_top_data + offset_N * sizeof(uint16_t), + ts_stride, // original column width + CTRL_NEURON); + } + + } // for (uint32_t offset_K = 0; offset_K < K; offset_K += tiled_K) + + } // for (uint32_t offset_N = 0; offset_N < N; offset_N += tiled_N) +} + +void cvm_fc_forward_kernel(cvk_context_t *ctx, uint32_t layer_id, gaddr_t bottom_data_gaddr, + gaddr_t weight_data_gaddr, gaddr_t bias_data_gaddr, + gaddr_t top_data_gaddr, int in_row, int in_col, int out_col, + int have_bias, int do_activation, int activation_method) { + // LLVM_DEBUG( + // printf("bf16_fc_forward_kernel\n" + // " bottom_gaddr 0x%lx, weight_gaddr 0x%lx, bias_gaddr 0x%lx, top_gaddr 0x%lx\n" + // " in (%d, %d), out (%d)\n" + // " has_bias %d, do_activation %d, activation_method %d\n", + // bottom_data_gaddr, weight_data_gaddr, bias_data_gaddr, top_data_gaddr, in_row, + // in_col, out_col, have_bias, do_activation, activation_method);); + + fc_slicing_multi_dimention(ctx, layer_id, bottom_data_gaddr, weight_data_gaddr, bias_data_gaddr, + top_data_gaddr, in_row, in_col, out_col, have_bias, do_activation, + activation_method); +} + +// gemm +inline static size_t get_neuron_csize_local(cvk_context_t *ctx, size_t h, size_t w, cvk_fmt_t fmt) { + size_t size = h * w * 
bitsize_of_fmt(fmt) / 8; + // ctx->info.eu_num neurons align + return ALIGN(size, ctx->info.eu_num); +} + +static int get_fmt_byte_sz(cvk_fmt_t fmt) { return bitsize_of_fmt(fmt) / 8; } + +static uint64_t get_slice_global_offset(uint64_t global_offset, size_t row_slice_idx, + size_t col_slice_idx, size_t row_num, size_t col_num, + size_t row_slice_num, size_t col_slice_num, cvk_fmt_t fmt) { + int fmt_byte_sz = get_fmt_byte_sz(fmt); + uint64_t slice_offset_row = 0; + if (row_slice_idx < (row_num % row_slice_num)) { + slice_offset_row = row_slice_idx * (row_num / row_slice_num + 1); + } else { + slice_offset_row = (row_num % row_slice_num) * (row_num / row_slice_num + 1) + + (row_slice_idx - (row_num % row_slice_num)) * (row_num / row_slice_num); + } + + uint64_t slice_offset_col = 0; + if (col_slice_idx < (col_num % col_slice_num)) { + slice_offset_col = col_slice_idx * (col_num / col_slice_num + 1); + } else { + slice_offset_col = (col_num % col_slice_num) * (col_num / col_slice_num + 1) + + (col_slice_idx - (col_num % col_slice_num)) * (col_num / col_slice_num); + } + + uint64_t slice_offset; + slice_offset = (slice_offset_col + slice_offset_row * col_num) * fmt_byte_sz; + return (global_offset + slice_offset); +} + +#define LOCAL_MEM_BANKS (ctx->info.lmem_banks) +#define NPU_SHIFT (get_num_shift(ctx->info.npu_num)) +#define MAX(a, b) (((a) > (b)) ? (a) : (b)) +//#define SMALL_TEST (1) +static size_t get_slice_num(cvk_context_t *ctx, size_t M, size_t N, size_t K, size_t *slice_num, + cvk_fmt_t fmt) { + int fmt_byte_sz = get_fmt_byte_sz(fmt); +#ifdef SMALL_TEST + size_t bank_size = (2048 / LOCAL_MEM_BANKS / fmt_byte_sz); +#else /* ! 
ifdef SMALL_TEST */ + size_t bank_size = (ctx->info.lmem_size / LOCAL_MEM_BANKS / fmt_byte_sz); +#endif /* SMALL_TEST */ + slice_num[0] = slice_num[1] = slice_num[2] = 1; + + size_t W_param = ctx->info.eu_num; + size_t csize_local = get_neuron_csize_local(ctx, 1, W_param, fmt); + size_t C_param = (K + W_param - 1) / W_param; + size_t size_A = M * ceiling_func_shift(C_param, NPU_SHIFT) * csize_local; + C_param = (N + W_param - 1) / W_param; + size_t size_B = K * ceiling_func_shift(C_param, NPU_SHIFT) * csize_local; + int res_byte_sz = 1; + if (fmt != CVK_FMT_BF16) { + // partial sum for 32bit output + res_byte_sz = sizeof(int); + } + + size_t size_C = res_byte_sz * M * ceiling_func_shift(C_param, NPU_SHIFT) * csize_local; + + DBG(" A size: %zu, B size: %zu, C size: %zu, bank: %zu\n", size_A, size_B, size_C, bank_size); + + if (size_A <= bank_size && size_B <= bank_size && size_C <= bank_size) { + return 0; + } else if (size_B <= bank_size) { + slice_num[0] = MAX(ceiling_func(size_A, bank_size), ceiling_func(size_C, bank_size)); + // split C local memory size + size_t slice_size = ceiling_func(M, slice_num[0]); + C_param = (N + ctx->info.eu_num - 1) / ctx->info.eu_num; + size_t slice_mem_C = slice_size * ceiling_func_shift(C_param, NPU_SHIFT) * csize_local; + if (slice_mem_C > bank_size) return 3; + return 1; + } else if (size_A <= bank_size) { + slice_num[1] = MAX(ceiling_func(size_B, bank_size), ceiling_func(size_C, bank_size)); + // Split more times in N, use load_stride maybe override previous data. + if (slice_num[1] > 1) { + int N_silce = N / slice_num[1]; + int N_floor = (N_silce / ctx->info.eu_num) * ctx->info.eu_num; + if (N_floor == 0) return 3; + slice_num[1] = ceiling_func(N, N_floor); + } + // + if (ceiling_func(ceiling_func(N, ctx->info.eu_num), NPU_SHIFT) < (int)slice_num[1]) return 3; + return 2; + } else { + return 3; + } +} + +static inline size_t get_max(size_t a, size_t b) { return a > b ? 
a : b; } + +static int load_matrix(cvk_context_t *ctx, cvk_ml_t *tlp, uint64_t gaddr_in, cvk_mg_shape_t shape, + cvk_mg_stride_t stride, cvk_fmt_t fmt) { + // load matrix data + cvk_mg_t ts_data; + ts_data.base_reg_index = 0; + ts_data.start_address = gaddr_in; + ts_data.stride = stride; + ts_data.fmt = fmt; + ts_data.shape = shape; + cvk_tdma_g2l_matrix_copy_param_t p1; + p1.src = &ts_data; + p1.dst = tlp; + if (fmt == CVK_FMT_BF16) { + ctx->ops->tdma_g2l_bf16_matrix_copy(ctx, &p1); + } else { + ctx->ops->tdma_g2l_matrix_copy(ctx, &p1); + } + + return 0; +} + +static int store_matrix(cvk_context_t *ctx, cvk_ml_t *tlp, uint64_t gaddr_in, cvk_mg_shape_t shape, + cvk_mg_stride_t stride, cvk_fmt_t fmt) { + cvk_mg_t ts_data; + ts_data.base_reg_index = 0; + ts_data.start_address = gaddr_in; + ts_data.fmt = fmt; + ts_data.shape = shape; + ts_data.stride = stride; + cvk_tdma_l2g_matrix_copy_param_t p3; + p3.src = tlp; + p3.dst = &ts_data; + if (fmt == CVK_FMT_BF16) { + ctx->ops->tdma_l2g_bf16_matrix_copy(ctx, &p3); + } else { + ctx->ops->tdma_l2g_matrix_copy(ctx, &p3); + } + + return 0; +} + +static int layerid = 0; +static void _matrix_multiplication(cvk_context_t *ctx, cvk_ml_t *tlp_a, cvk_ml_t *tlp_b, + cvk_ml_t *tlp_c, int ps32_mode) { + // mac + cvk_tiu_matrix_multiplication_param_t p2; + p2.bias = NULL; + p2.left = tlp_a; + p2.right = tlp_b; + p2.res = tlp_c; + p2.lshift_bits = 0; + p2.rshift_bits = 0; + if (tlp_c->fmt == CVK_FMT_BF16) { + p2.res_is_int8 = true; + } else { + // for int + p2.res_is_int8 = false; + } + p2.relu_enable = 0; + p2.add_result = 0; /*bf16 HW does not support add_result*/ + p2.ps32_mode = ps32_mode; + + p2.layer_id = layerid; + layerid++; + ctx->ops->tiu_matrix_multiplication(ctx, &p2); +} + +static void strategy_no_slice(cvk_context_t *ctx, size_t M, size_t N, size_t K, uint64_t gaddr_a, + uint64_t gaddr_b, uint64_t gaddr_c, cvk_fmt_t fmt) { + cvk_ml_t *tlp_a; + cvk_ml_t *tlp_b; + cvk_ml_t *tlp_c; + // size_t bank_size = ctx->info.lmem_size 
/ LOCAL_MEM_BANKS * 2; + int fmt_byte_sz = get_fmt_byte_sz(fmt); + // size_t bank_size = ctx->info.lmem_size / LOCAL_MEM_BANKS; + int psmode = 0; // default for bf16 + + cvk_ml_shape_t shape_a = ctx->ops->ml_default_shape(ctx, M, K, fmt); + cvk_ml_shape_t shape_b = ctx->ops->ml_default_shape(ctx, K, N, fmt); + cvk_ml_shape_t shape_c = ctx->ops->ml_default_shape(ctx, M, N, fmt); + + tlp_a = ctx->ops->lmem_alloc_matrix(ctx, shape_a, fmt, CTRL_AL); + tlp_b = ctx->ops->lmem_alloc_matrix(ctx, shape_b, fmt, CTRL_AL); + if (fmt == CVK_FMT_BF16) { + tlp_c = ctx->ops->lmem_alloc_matrix(ctx, shape_c, fmt, CTRL_AL); + } else { + shape_c.n = shape_a.n; + shape_c.c = shape_b.c; + shape_c.w = shape_b.w; + shape_c.col = shape_b.col; + + tlp_c = ctx->ops->lmem_alloc_ps32_matrix(ctx, shape_c, fmt, CTRL_AL); + psmode = 2; + } + + cvk_mg_shape_t shape; + shape.row = tlp_a->shape.n; + shape.col = tlp_a->shape.col; + cvk_mg_stride_t stride; + stride.row = (uint32_t)K * fmt_byte_sz; + load_matrix(ctx, tlp_a, gaddr_a, shape, stride, fmt); + + shape.row = tlp_b->shape.n; + shape.col = tlp_b->shape.col; + stride.row = (uint32_t)N * fmt_byte_sz; + load_matrix(ctx, tlp_b, gaddr_b, shape, stride, fmt); + + // mac + _matrix_multiplication(ctx, tlp_a, tlp_b, tlp_c, psmode); + + if (fmt != CVK_FMT_BF16) { + tlp_c->shape.n *= sizeof(int); // partial sum for 32bit output + } + shape.row = tlp_c->shape.n; + shape.col = tlp_c->shape.col; + stride.row = (uint32_t)N * fmt_byte_sz; + store_matrix(ctx, tlp_c, gaddr_c, shape, stride, fmt); + + ctx->ops->lmem_free_matrix(ctx, tlp_c); + ctx->ops->lmem_free_matrix(ctx, tlp_b); + ctx->ops->lmem_free_matrix(ctx, tlp_a); +} + +static void strategy_slice_on_M(cvk_context_t *ctx, size_t M, size_t N, size_t K, uint64_t gaddr_a, + uint64_t gaddr_b, uint64_t gaddr_c, size_t slice_num, + cvk_fmt_t fmt) { + cvk_ml_t *tlp_b; + int fmt_byte_sz = get_fmt_byte_sz(fmt); + + cvk_ml_shape_t s_B = ctx->ops->ml_default_shape(ctx, K, N, fmt); + + tlp_b = 
ctx->ops->lmem_alloc_matrix(ctx, s_B, fmt, CTRL_AL); + + cvk_mg_shape_t shape; + shape.row = tlp_b->shape.n; + shape.col = tlp_b->shape.col; + cvk_mg_stride_t stride; + stride.row = (uint32_t)N * fmt_byte_sz; + load_matrix(ctx, tlp_b, gaddr_b, shape, stride, fmt); + + int pack_shift = 0; + int psmode = 0; // default for bf16 + for (size_t slice_idx = 0; slice_idx < slice_num; slice_idx++) { + cvk_ml_t *tlp_a; + size_t M_slice = M / slice_num + (slice_idx < M % slice_num); + cvk_ml_shape_t s_A = ctx->ops->ml_default_shape(ctx, M_slice, K, fmt); + tlp_a = ctx->ops->lmem_alloc_matrix(ctx, s_A, fmt, CTRL_AL); + + uint64_t A_slice_global_offset = + get_slice_global_offset(gaddr_a, slice_idx, 0, M, K, slice_num, 1, fmt); + + cvk_mg_shape_t shape; + shape.row = tlp_a->shape.n; + shape.col = tlp_a->shape.col; + cvk_mg_stride_t st_A; + st_A.row = (uint32_t)K * fmt_byte_sz; + load_matrix(ctx, tlp_a, A_slice_global_offset, shape, st_A, fmt); + + tlp_a->shape.n = M_slice; + tlp_a->shape.col = K; + + cvk_ml_shape_t s_C = ctx->ops->ml_default_shape(ctx, M_slice, N, fmt); + cvk_ml_t *tlp_c; + + if (fmt == CVK_FMT_BF16) { + tlp_c = ctx->ops->lmem_alloc_matrix(ctx, s_C, fmt, CTRL_AL); + } else { + s_C.n = tlp_a->shape.n; + s_C.c = tlp_b->shape.c; + s_C.w = tlp_b->shape.w; + s_C.col = tlp_b->shape.col; + + tlp_c = ctx->ops->lmem_alloc_ps32_matrix(ctx, s_C, fmt, CTRL_AL); + psmode = 2; + } + + uint64_t C_slice_global_offset = + get_slice_global_offset(gaddr_c, slice_idx, 0, M, N, slice_num, 1, fmt); + + if (fmt != CVK_FMT_BF16) { + // int32 pack + C_slice_global_offset = get_slice_global_offset(gaddr_c, 0, 0, M, N, slice_num, 1, fmt); + C_slice_global_offset += pack_shift; + pack_shift += (tlp_c->shape.n * tlp_c->shape.col * sizeof(int)); + // C_slice_global_offset = + // get_slice_global_offset(gaddr_c, slice_idx, 0, M * 2, N * 2, slice_num, 1, fmt); + } + + _matrix_multiplication(ctx, tlp_a, tlp_b, tlp_c, psmode); + + if (fmt != CVK_FMT_BF16) { + tlp_c->shape.n *= sizeof(int); // 
partial sum for 32bit output + } + shape.row = tlp_c->shape.n; + shape.col = tlp_c->shape.col; + stride.row = (uint32_t)N * fmt_byte_sz; // place with no tiling + store_matrix(ctx, tlp_c, C_slice_global_offset, shape, stride, fmt); + + ctx->ops->lmem_free_matrix(ctx, tlp_c); + ctx->ops->lmem_free_matrix(ctx, tlp_a); + } + ctx->ops->lmem_free_matrix(ctx, tlp_b); +} + +static void strategy_slice_on_N(cvk_context_t *ctx, size_t M, size_t N, size_t K, uint64_t gaddr_a, + uint64_t gaddr_b, uint64_t gaddr_c, size_t slice_num, + cvk_fmt_t fmt) { + cvk_ml_t *tlp_a; + int fmt_byte_sz = get_fmt_byte_sz(fmt); + + cvk_ml_shape_t s_a = ctx->ops->ml_default_shape(ctx, M, K, fmt); + tlp_a = ctx->ops->lmem_alloc_matrix(ctx, s_a, fmt, CTRL_AL); + + cvk_mg_stride_t stride; + stride.row = (uint32_t)K * fmt_byte_sz; + cvk_mg_shape_t shape; + shape.row = tlp_a->shape.n; + shape.col = tlp_a->shape.col; + load_matrix(ctx, tlp_a, gaddr_a, shape, stride, fmt); + + int pack_shift = 0; + int psmode = 0; // default for bf16 + for (size_t slice_idx = 0; slice_idx < slice_num; slice_idx++) { + size_t N_slice = N / slice_num + (slice_idx < N % slice_num); + + cvk_ml_shape_t s_b = ctx->ops->ml_default_shape(ctx, K, N_slice, fmt); + cvk_ml_t *tlp_b; + tlp_b = ctx->ops->lmem_alloc_matrix(ctx, s_b, fmt, CTRL_AL); + + uint64_t B_slice_global_offset = + get_slice_global_offset(gaddr_b, 0, slice_idx, K, N, 1, slice_num, fmt); + + // load b + stride.row = (uint32_t)N * fmt_byte_sz; + shape.row = tlp_b->shape.n; + shape.col = tlp_b->shape.col; + load_matrix(ctx, tlp_b, B_slice_global_offset, shape, stride, fmt); + + // c for answer + cvk_ml_shape_t s_c = ctx->ops->ml_default_shape(ctx, M, N_slice, fmt); + cvk_ml_t *tlp_c; + if (fmt == CVK_FMT_BF16) { + tlp_c = ctx->ops->lmem_alloc_matrix(ctx, s_c, fmt, CTRL_AL); + } else { + s_c.n = tlp_a->shape.n; + s_c.c = tlp_b->shape.c; + s_c.w = tlp_b->shape.w; + s_c.col = tlp_b->shape.col; + + tlp_c = ctx->ops->lmem_alloc_ps32_matrix(ctx, s_c, fmt, CTRL_AL); + 
psmode = 2; + } + + uint64_t C_slice_global_offset = + get_slice_global_offset(gaddr_c, 0, slice_idx, M, N, 1, slice_num, fmt); + if (fmt != CVK_FMT_BF16) { + // int32 pack + C_slice_global_offset = get_slice_global_offset(gaddr_c, 0, 0, M, N, 1, slice_num, fmt); + C_slice_global_offset += pack_shift; + pack_shift += tlp_c->shape.col; + // C_slice_global_offset = + // get_slice_global_offset(gaddr_c, 0, slice_idx, M*2, N*2, 1, slice_num, fmt); + } + + _matrix_multiplication(ctx, tlp_a, tlp_b, tlp_c, psmode); + + if (fmt != CVK_FMT_BF16) { + tlp_c->shape.n *= sizeof(int); // partial sum for 32bit output + } + shape.row = tlp_c->shape.n; + shape.col = tlp_c->shape.col; + stride.row = (uint32_t)N * fmt_byte_sz; + store_matrix(ctx, tlp_c, C_slice_global_offset, shape, stride, fmt); + + ctx->ops->lmem_free_matrix(ctx, tlp_c); + ctx->ops->lmem_free_matrix(ctx, tlp_b); + } + + ctx->ops->lmem_free_matrix(ctx, tlp_a); +} + +static void slice_split_strategy(cvk_context_t *ctx, size_t M, size_t N, size_t K, + size_t *slice_num, cvk_fmt_t fmt) { + int fmt_byte_sz = get_fmt_byte_sz(fmt); + size_t W_param = ctx->info.eu_num; + size_t channel_size_local = get_neuron_csize_local(ctx, 1, W_param, fmt); +#ifdef SMALL_TEST + size_t bank_size = (2048 / LOCAL_MEM_BANKS / fmt_byte_sz); +#else + size_t bank_size = (ctx->info.lmem_size / LOCAL_MEM_BANKS / fmt_byte_sz); +#endif + size_t bank_size_half = bank_size >> 1; + slice_num[0] = slice_num[1] = slice_num[2] = 1; + + // input blob + size_t C_param = (K + W_param - 1) / W_param; + size_t local_size_A = M * (ceiling_func_shift(C_param, NPU_SHIFT)) * channel_size_local; + size_t slice_num_A = (local_size_A + bank_size_half - 1) / (bank_size_half); + size_t col_slice_time_A = ceiling_func_shift(C_param, NPU_SHIFT); + size_t row_slice_time_A = (slice_num_A < M) ? 
slice_num_A : M; + + // weight blob + C_param = (N + W_param - 1) / W_param; + size_t local_size_B = K * (ceiling_func_shift(C_param, NPU_SHIFT)) * channel_size_local; + size_t slice_num_B = (local_size_B + bank_size_half - 1) / bank_size_half; + size_t row_slice_time_B = (slice_num_B < K) ? slice_num_B : K; + + // output blob + C_param = (N + W_param - 1) / W_param; + // multi 2 for simulating result add + int outputs_nr = 2; + if (fmt != CVK_FMT_BF16) { + // int8 output 32bit result + // outputs_nr = 4; + } + size_t local_size_C = (M + 1) * (ceiling_func_shift(C_param, NPU_SHIFT)) * channel_size_local; + size_t slice_num_C = (local_size_C + bank_size * outputs_nr - 1) / (bank_size * outputs_nr); + size_t col_slice_time_C = ceiling_func_shift(C_param, NPU_SHIFT); + + // A + if (col_slice_time_A == 0) { + slice_num[0] = row_slice_time_A; + } else { + if (col_slice_time_A < slice_num_A) { + slice_num[0] = (slice_num_A + col_slice_time_A - 1) / col_slice_time_A; + } else { + slice_num[0] = 1; + slice_num[2] = slice_num_A; + } + } + + // C + if ((slice_num_C > slice_num[0]) && col_slice_time_C) { + size_t tmp = (slice_num_C + slice_num[0] - 1) / slice_num[0]; + slice_num[1] = (col_slice_time_C > tmp) ? tmp : col_slice_time_C; + } + + // B + if (slice_num_B > slice_num[1]) { + size_t tmp = (slice_num_B + slice_num[1] - 1) / slice_num[1]; + slice_num[2] = get_max(slice_num[2], (row_slice_time_B > tmp) ? 
tmp : row_slice_time_B); + } + // fine-tuning + size_t matrix_shape[3] = {1, 1, 1}; + while (true) { + matrix_shape[0] = (M + slice_num[0] - 1) / slice_num[0]; + matrix_shape[2] = (N + slice_num[1] - 1) / slice_num[1]; + matrix_shape[1] = (K + slice_num[2] - 1) / slice_num[2]; + size_t C_param_input_col = (matrix_shape[1] + W_param - 1) / W_param; + size_t C_param_weight_col = (matrix_shape[2] + W_param - 1) / W_param; + + size_t local_size_B = + matrix_shape[1] * (ceiling_func_shift(C_param_weight_col, NPU_SHIFT)) * channel_size_local; + size_t local_size_C = + matrix_shape[0] * (ceiling_func_shift(C_param_weight_col, NPU_SHIFT)) * channel_size_local; + size_t local_size_A = + matrix_shape[0] * (ceiling_func_shift(C_param_input_col, NPU_SHIFT)) * channel_size_local; + bool slicing_success = (local_size_A <= bank_size_half) && + (local_size_C <= bank_size * outputs_nr) && + (local_size_B <= bank_size_half); + + if (slicing_success) { + if (slice_num[1] > 1) { + int N_silce = N / slice_num[1]; + int N_floor = (N_silce / ctx->info.eu_num) * ctx->info.eu_num; + ASSERT(N_floor); + slice_num[1] = ceiling_func(N, N_floor); + } +#if 0 // def DEBUG_LOCAL + size_t bias_local_size = + (ceiling_func_shift(C_param_weight_col, NPU_SHIFT)) * channel_size_local; + // DBG("multi-dim slicing:\n"); + DBG("local_size_B = %lu\n", local_size_B); + DBG("local_size_C = %lu\n", local_size_C); + DBG("local_size_A = %lu\n", local_size_A); + DBG("bias_local_size = %lu\n", bias_local_size); +#endif + return; + } else if (local_size_A > bank_size_half) { + slice_num[2]++; + } else if (local_size_B > bank_size_half) { + slice_num[2]++; + } else if (local_size_C > 2 * bank_size) { + slice_num[0]++; + } + } +} + +static void strategy_slice_on_multidim_init(cvk_context_t *ctx, gaddr_t *slice_global_offset, + size_t *matrix_shape, size_t *slice_row_stride, + cvk_fmt_t fmt) { + int fmt_byte_sz = get_fmt_byte_sz(fmt); + gaddr_t global_offset_A = slice_global_offset[0]; + gaddr_t global_offset_B = 
slice_global_offset[1]; + size_t row_num_A = matrix_shape[0]; + size_t col_num_A = matrix_shape[1]; + size_t col_num_B = matrix_shape[2]; + + cvk_ml_shape_t s_A, s_B; + s_A = ctx->ops->ml_default_shape(ctx, row_num_A, col_num_A, fmt); + s_B = ctx->ops->ml_default_shape(ctx, col_num_A, col_num_B, fmt); + + cvk_mg_stride_t st_A, st_B; + st_A.row = (uint32_t)slice_row_stride[0] * fmt_byte_sz; + st_B.row = (uint32_t)slice_row_stride[1] * fmt_byte_sz; + +#ifdef SMALL_TEST + size_t bank_size = 2048 / LOCAL_MEM_BANKS; +#else + size_t bank_size = ctx->info.lmem_size / LOCAL_MEM_BANKS * 2; +#endif + cvk_ml_t *tl_A = bmk1880v2_matrix_lmem_prealloc_align(ctx, &matrix_lmem[0], 0, s_A, fmt, CTRL_AL); + cvk_ml_t *tl_B = + bmk1880v2_matrix_lmem_prealloc_align(ctx, &matrix_lmem[1], bank_size, s_B, fmt, CTRL_AL); + cvk_mg_shape_t shape; + shape.row = tl_A->shape.n; + shape.col = tl_A->shape.col; + load_matrix(ctx, tl_A, global_offset_A, shape, st_A, fmt); + + shape.row = tl_B->shape.n; + shape.col = tl_B->shape.col; + load_matrix(ctx, tl_B, global_offset_B, shape, st_B, fmt); + // DBG("0->load from %u/%u,off %lu/%lu\n", tl_A->start_address, tl_B->start_address, + // global_offset_A, + // global_offset_B); + + bool is_alloc_from_stack = true; + bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_B); + bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_A); +} + +static int strategy_slice_on_multi_dimension_internal( + cvk_context_t *ctx, size_t *slice_idx, size_t *slice_num, gaddr_t *slice_global_offset, + size_t *matrix_shape, gaddr_t *slice_global_offset_next, size_t *matrix_shape_next, + size_t *slice_row_stride, cvk_fmt_t fmt) { + int fmt_byte_sz = get_fmt_byte_sz(fmt); + size_t row_num_A = matrix_shape[0]; + size_t col_num_A = matrix_shape[1]; + size_t col_num_B = matrix_shape[2]; + + gaddr_t global_offset_next_A = slice_global_offset_next[0]; + gaddr_t global_offset_next_B = slice_global_offset_next[1]; + + size_t row_num_next_A = 
matrix_shape_next[0]; + size_t col_num_next_A = matrix_shape_next[1]; + size_t col_num_next_B = matrix_shape_next[2]; + + cvk_ml_shape_t s_next_A = ctx->ops->ml_default_shape(ctx, row_num_next_A, col_num_next_A, fmt); + cvk_ml_shape_t s_next_B = ctx->ops->ml_default_shape(ctx, col_num_next_A, col_num_next_B, fmt); + cvk_ml_shape_t s_A, s_B, s_C; + + s_A = ctx->ops->ml_default_shape(ctx, row_num_A, col_num_A, fmt); + s_B = ctx->ops->ml_default_shape(ctx, col_num_A, col_num_B, fmt); + s_C = ctx->ops->ml_default_shape(ctx, row_num_A, col_num_B, fmt); + + // int partition = 2; // 2 means one for A/B, another for C with double output + // if (fmt != CVK_FMT_BF16) { + // partition = 6; + //} + // size_t bank_size = ctx->info.lmem_size / LOCAL_MEM_BANKS * 2; +#ifdef SMALL_TEST + size_t bank_size = 2048 / LOCAL_MEM_BANKS; +#else + size_t bank_size = ctx->info.lmem_size / LOCAL_MEM_BANKS * 2; +#endif + size_t hf_bsize = bank_size / 2; + size_t cur = slice_idx[1] % 2; + size_t next = (slice_idx[1] + 1) % 2; + + int output_nr = 2; // 2 means output with low part and high part + int psmode = 1; // default for bf16 + if (fmt != CVK_FMT_BF16) { + // output_nr = 4; // 4 for 32bit output with 4 * 1 byte output + psmode = 3; + } + + cvk_mg_stride_t st_A, st_C, st_B; + st_A.row = (uint32_t)slice_row_stride[0] * fmt_byte_sz; + st_C.row = (uint32_t)slice_row_stride[1] * fmt_byte_sz; + st_B.row = (uint32_t)slice_row_stride[1] * fmt_byte_sz; + + cvk_ml_t *tl_A = + bmk1880v2_matrix_lmem_prealloc_align(ctx, &matrix_lmem[0], hf_bsize * cur, s_A, fmt, CTRL_AL); + cvk_ml_t *tl_next_A = bmk1880v2_matrix_lmem_prealloc_align(ctx, &matrix_lmem[1], hf_bsize * next, + s_next_A, fmt, CTRL_AL); + cvk_ml_t *tl_B = bmk1880v2_matrix_lmem_prealloc_align( + ctx, &matrix_lmem[2], bank_size + hf_bsize * cur, s_B, fmt, CTRL_AL); + cvk_ml_t *tl_next_B = bmk1880v2_matrix_lmem_prealloc_align( + ctx, &matrix_lmem[3], bank_size + hf_bsize * next, s_next_B, fmt, CTRL_AL); + cvk_ml_t *tl_C = 
bmk1880v2_matrix_lmem_prealloc_align(ctx, &matrix_lmem[4], output_nr * bank_size, + s_C, fmt, CTRL_AL); + + ctx->ops->parallel_enable(ctx); + + if (slice_num[2] - 1 > slice_idx[1]) { + st_A.row = (uint32_t)slice_row_stride[0] * fmt_byte_sz; + st_B.row = (uint32_t)slice_row_stride[1] * fmt_byte_sz; + + cvk_mg_shape_t shape; + shape.row = tl_next_A->shape.n; + shape.col = tl_next_A->shape.col; + load_matrix(ctx, tl_next_A, global_offset_next_A, shape, st_A, fmt); + + shape.row = tl_next_B->shape.n; + shape.col = tl_next_B->shape.col; + load_matrix(ctx, tl_next_B, global_offset_next_B, shape, st_B, fmt); + + // DBG("do %u/%u ", tl_A->start_address, tl_B->start_address); +#define PS32_CTRL_RA (3) /* normal case */ +#define PS32_CTRL_NULL (2) /* first one */ + int pint32_t_status = slice_idx[1] > 0 ? PS32_CTRL_RA : PS32_CTRL_NULL; + _matrix_multiplication(ctx, tl_A, tl_B, tl_C, pint32_t_status); + // DBG(">load from %u/%u off %lu/%lu s(%d)\n", tl_next_A->start_address, + // tl_next_B->start_address, + // global_offset_next_A, global_offset_next_B, pint32_t_status); + } + + if (slice_idx[1] == slice_num[2] - 1) { + // last one + // not using ps mode 1 cuz it could saturate from 32bit to 16 bit + _matrix_multiplication(ctx, tl_A, tl_B, tl_C, psmode); + } + + ctx->ops->parallel_disable(ctx); + + if (slice_idx[1] == slice_num[2] - 1) { + // last one + cvk_mg_shape_t shape; + if (fmt != CVK_FMT_BF16) { + tl_C->shape.n *= sizeof(int); // partial sum for 32bit output + } + shape.row = tl_C->shape.n; + shape.col = tl_C->shape.col; + st_C.row = (uint32_t)slice_row_stride[1] * fmt_byte_sz; + + store_matrix(ctx, tl_C, slice_global_offset[2], shape, st_C, fmt); + // DBG("local memory a/b/c is %u/%u/%u, store to slice_global_offset[2] %lu\n", + // tl_A->start_address, tl_B->start_address, tl_C->start_address, slice_global_offset[2]); + } + + bool is_alloc_from_stack = true; + + if (cur == 0) { + bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_next_B); + 
bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_B); + bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_next_A); + bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_A); + } else { + bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_B); + bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_next_B); + bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_A); + bmk1880v2_lmem_free_prealloc_bf16_matrix(ctx, is_alloc_from_stack, tl_next_A); + } + + return 0; +} + +static void strategy_slice_on_multi_dimension(cvk_context_t *ctx, gaddr_t global_offset_A, + gaddr_t global_offset_B, gaddr_t global_offset_C, + size_t M, size_t N, size_t K, size_t *slice_num, + cvk_fmt_t fmt) { + size_t slice_row_stride[4] = {0, 0, 0, 0}; + slice_row_stride[0] = K; + slice_row_stride[1] = N; + + gaddr_t slice_global_offset[3] = {0, 0, 0}; + gaddr_t slice_global_offset_next[4] = {0, 0, 0, 0}; + size_t slice_idx[3] = {0, 0, 0}; + size_t matrix_shape[3] = {0, 0, 0}; + size_t matrix_shape_next[3] = {0, 0, 0}; + int slice_idx_0 = slice_idx[0]; + int slice_idx_2 = slice_idx[2]; + int pack_shift = 0; + for (slice_idx[0] = 0; slice_idx[0] < slice_num[0]; slice_idx[0]++) { + matrix_shape[0] = M / slice_num[0] + (slice_idx[0] < M % slice_num[0]); + matrix_shape_next[0] = M / slice_num[0] + (0 + slice_idx[0] < M % slice_num[0]); + for (slice_idx[2] = 0; slice_idx[2] < slice_num[1]; slice_idx[2]++) { + matrix_shape[2] = N / slice_num[1] + (slice_idx[2] < N % slice_num[1]); + matrix_shape_next[2] = N / slice_num[1] + (0 + slice_idx[1] < N % slice_num[1]); + for (slice_idx[1] = 0; slice_idx[1] < slice_num[2]; slice_idx[1]++) { + matrix_shape[1] = K / slice_num[2] + (slice_idx[1] < K % slice_num[2]); + matrix_shape_next[1] = K / slice_num[2] + (1 + slice_idx[1] < K % slice_num[2]); + slice_global_offset[0] = get_slice_global_offset( + global_offset_A, slice_idx[0], slice_idx[1], M, K, 
slice_num[0], slice_num[2], fmt); + slice_global_offset[1] = get_slice_global_offset( + global_offset_B, slice_idx[1], slice_idx[2], K, N, slice_num[2], slice_num[1], fmt); + // the low 8-bits of C + if (fmt == CVK_FMT_BF16) { + slice_global_offset[2] = get_slice_global_offset( + global_offset_C, slice_idx[0], slice_idx[2], M, N, slice_num[0], slice_num[1], fmt); + } else { + slice_global_offset[2] = get_slice_global_offset( + global_offset_C, slice_idx_0, slice_idx_2, M, N, slice_num[0], slice_num[1], fmt); + if (slice_idx[1] == slice_num[2] - 1) { + // only shift in real store + slice_global_offset[2] += pack_shift; + // FIXME: slice N, currently ONLY slice M and K + size_t row_num_A = matrix_shape[0]; + size_t col_num_B = matrix_shape[2]; + pack_shift += (row_num_A * col_num_B * sizeof(int)); + } + } + + slice_global_offset_next[0] = get_slice_global_offset( + global_offset_A, slice_idx[0], slice_idx[1] + 1, M, K, slice_num[0], slice_num[2], fmt); + slice_global_offset_next[1] = get_slice_global_offset( + global_offset_B, slice_idx[1] + 1, slice_idx[2], K, N, slice_num[2], slice_num[1], fmt); + // DBG("=>(%s)slice_global_offset[0](%lu)/slice_global_offset[1](%lu) for slice_idx[1](%lu) + // " + // "== 0\n" + // ", (%s)slice_global_offset[2](%lu) for slice_idx[1](%lu) == slice_num[2](%lu) - 1\n" + // ", (%s)(next)slice_global_offset_next[0](%lu)/slice_global_offset_next[1](%lu) for " + // "slice_num[2](%lu) - 1 > slice_idx[1](%lu)\n" + // "next ctrl:%s, store ctrl:%s\n", + // slice_idx[1] == 0 ? "en" : " ", slice_global_offset[0], slice_global_offset[1], + // slice_idx[1], slice_idx[1] == slice_num[2] - 1 ? "en" : " ", slice_global_offset[2], + // slice_idx[1], slice_num[2], slice_num[2] - 1 > slice_idx[1] ? "en" : " ", + // slice_global_offset_next[0], slice_global_offset_next[1], slice_num[2], slice_idx[1], + // (slice_idx[1] > 0) ? "CTRL_RA" : "CTRL_NULL", + // (slice_num[2] > 1) ? 
"CTRL_RA" : "CTRL_NULL"); + + if (slice_idx[1] == 0) { + strategy_slice_on_multidim_init(ctx, slice_global_offset, matrix_shape, slice_row_stride, + fmt); + } + + strategy_slice_on_multi_dimension_internal(ctx, slice_idx, slice_num, slice_global_offset, + matrix_shape, slice_global_offset_next, + matrix_shape_next, slice_row_stride, fmt); + } + } + } +} + +size_t *bmblas_gemm(cvk_context_t *ctx, size_t M, size_t N, size_t K, uint64_t gaddr_a, + uint64_t gaddr_b, uint64_t gaddr_c, cvk_fmt_t fmt) { + size_t slice_num[3] = {1, 1, 1}; + ASSERT(slice_num[0] <= M && slice_num[0] >= 1); + ASSERT(slice_num[1] <= N && slice_num[1] >= 1); + ASSERT(slice_num[2] <= K && slice_num[2] >= 1); + + size_t strategy = get_slice_num(ctx, M, N, K, slice_num, fmt); + // printf("strategy: %lu\n slice %lu %lu %lu\n", strategy, slice_num[0], slice_num[1], + // slice_num[2]); + + switch (strategy) { + case 0: { + strategy_no_slice(ctx, M, N, K, gaddr_a, gaddr_b, gaddr_c, fmt); + } break; + case 1: { + strategy_slice_on_M(ctx, M, N, K, gaddr_a, gaddr_b, gaddr_c, slice_num[0], fmt); + } break; + case 2: { + strategy_slice_on_N(ctx, M, N, K, gaddr_a, gaddr_b, gaddr_c, slice_num[1], fmt); + } break; + case 3: { + slice_split_strategy(ctx, M, N, K, slice_num, fmt); + // printf("slice all, %lu %lu %lu\n", slice_num[0], slice_num[1], slice_num[2]); + + strategy_slice_on_multi_dimension(ctx, gaddr_a, gaddr_b, gaddr_c, M, N, K, slice_num, fmt); + } + default: + break; + } + // 3 indicate M N K + int slice_num_len = 4 * sizeof(size_t); + size_t *_slice_num = (size_t *)malloc(slice_num_len); + memcpy(_slice_num, slice_num, 3 * sizeof(size_t)); + _slice_num[3] = strategy; + return _slice_num; +} + +size_t *cvm_gemm(cvk_context_t *ctx, gaddr_t bottom_data_gaddr, gaddr_t weight_data_gaddr, + gaddr_t top_data_gaddr, int in_row, int in_col, int out_col, cvk_fmt_t fmt) { + size_t *slice_num = NULL; + if (0) { + // backend impelement + cvm_fc_forward_kernel(ctx, 0, bottom_data_gaddr, weight_data_gaddr, 
GADDR_INVALID, + top_data_gaddr, in_row, in_col, out_col, 0, 0, 0); + } else { + slice_num = bmblas_gemm(ctx, in_row, out_col, in_col, bottom_data_gaddr, weight_data_gaddr, + top_data_gaddr, fmt); + } + return slice_num; +} + +int cvm_combin_gemm_i8(size_t *slice_num, uint8_t *i8_C, uint32_t *i32_C, int M, int N) { + int bstride = M * N; + int size = bstride; + + int strategy = slice_num[3]; + int chunks = slice_num[0] * slice_num[1] * slice_num[2]; + int chunk_size = M * N / chunks; + size = chunk_size; + bstride = chunk_size; + if (strategy == 0 || strategy == 2) { + // slice N + int pack_shift = 0; + int N_slice_cnt = 0; + for (int tiling = 0; tiling < chunks; tiling++) { + size_t N_slice = N / slice_num[1] + (tiling < (int)(N % slice_num[1])); + chunk_size = N_slice * M; + size = chunk_size; + bstride = M * N; + for (int m = 0; m < (int)M; m++) { + for (int n = 0; n < (int)N_slice; n++) { + int shift = N_slice_cnt + m * N + n; + i32_C[shift] = (i8_C[shift + bstride * 0]) | (i8_C[shift + bstride * 1] << 8) | + (i8_C[shift + bstride * 2] << 16) | (i8_C[shift + bstride * 3] << 24); + } + } + pack_shift += size; + N_slice_cnt += N_slice; + } + } else if (strategy == 1) { + int pack_shift = 0; + for (int tiling = 0; tiling < chunks; tiling++) { + size_t M_slice = M / slice_num[0] + (tiling < (int)(M % slice_num[0])); + chunk_size = M_slice * N; + size = chunk_size; + bstride = chunk_size; + for (int i = 0; i < size; i++) { + i32_C[pack_shift + i] = (i8_C[pack_shift * sizeof(int) + i + bstride * 0]) | + (i8_C[pack_shift * sizeof(int) + i + bstride * 1] << 8) | + (i8_C[pack_shift * sizeof(int) + i + bstride * 2] << 16) | + (i8_C[pack_shift * sizeof(int) + i + bstride * 3] << 24); + } + pack_shift += size; + } + } else if (strategy == 3) { + // tiling all, it MUST tiling M/K ONLY + // FIXME: tiling N + int pack_shift = 0; + for (int tiling = 0; tiling < (int)slice_num[0]; tiling++) { + size_t M_slice = M / slice_num[0] + (tiling < (int)(M % slice_num[0])); + int size = 
M_slice * N; + int bstride = size; + for (int i = 0; i < size; i++) { + i32_C[pack_shift + i] = (i8_C[pack_shift * sizeof(int) + i + bstride * 0]) | + (i8_C[pack_shift * sizeof(int) + i + bstride * 1] << 8) | + (i8_C[pack_shift * sizeof(int) + i + bstride * 2] << 16) | + (i8_C[pack_shift * sizeof(int) + i + bstride * 3] << 24); + } + pack_shift += size; + } + } + return 0; +} diff --git a/cvimath/src/blas_cpu.cpp b/cvimath/src/blas_cpu.cpp new file mode 100644 index 000000000..7b091116b --- /dev/null +++ b/cvimath/src/blas_cpu.cpp @@ -0,0 +1,82 @@ +#include + +#include +#ifdef __ARM_ARCH +#include +#endif + +template +void k_selection_sort_index(T *array, uint32_t *index, T *value, const uint32_t array_size, + const uint32_t k) { + for (uint32_t i = 0; i < k; i++) { + int largest = 0; + for (uint32_t j = 0; j < array_size; j++) { + if (array[j] > array[largest]) { + largest = j; + } + } + value[i] = array[largest]; + index[i] = largest; + array[largest] = 0; + } +} + +inline uint32_t dot(uint8_t *a, uint8_t *b, uint32_t data_length) { + uint32_t dot_result = 0; + for (uint32_t i = 0; i < data_length; i++) { + dot_result += ((short)a[i] * b[i]); + } + return dot_result; +} + +inline int32_t dot_i8(int8_t *a, int8_t *b, uint32_t data_length) { + int32_t dot_result = 0; + for (uint32_t i = 0; i < data_length; i++) { + dot_result += ((short)a[i] * b[i]); + } + return dot_result; +} + +void cvm_gen_precached_i8_unit_length(int8_t *precached, float *unit_precached_arr, + const uint32_t data_length, const uint32_t data_num) { + for (uint32_t i = 0; i < data_num; i++) { + int8_t *fb_offset = precached + i * data_length; + unit_precached_arr[i] = dot_i8(fb_offset, fb_offset, data_length); + unit_precached_arr[i] = sqrt(unit_precached_arr[i]); + } +} + +void cvm_gen_precached_u8_unit_length(uint8_t *precached, float *unit_precached_arr, + const uint32_t data_length, const uint32_t data_num) { + for (uint32_t i = 0; i < data_num; i++) { + uint8_t *fb_offset = precached + i * 
data_length; + unit_precached_arr[i] = dot(fb_offset, fb_offset, data_length); + unit_precached_arr[i] = sqrt(unit_precached_arr[i]); + } +} + +void cvm_cpu_i8data_ip_match(int8_t *feature, int8_t *precached, float *unit_precached_arr, + uint32_t *k_index, float *k_value, float *buffer, + const uint32_t data_length, const uint32_t data_num, + const uint32_t k) { + float unit_feature = (float)dot_i8(feature, feature, data_length); + unit_feature = sqrt(unit_feature); + for (uint32_t i = 0; i < data_num; i++) { + buffer[i] = dot_i8(feature, precached + i * data_length, data_length) / + (unit_feature * unit_precached_arr[i]); + } + k_selection_sort_index(buffer, k_index, k_value, data_num, k); +} + +void cvm_cpu_u8data_ip_match(uint8_t *feature, uint8_t *precached, float *unit_precached_arr, + uint32_t *k_index, float *k_value, float *buffer, + const uint32_t data_length, const uint32_t data_num, + const uint32_t k) { + float unit_feature = (float)dot(feature, feature, data_length); + unit_feature = sqrt(unit_feature); + for (uint32_t i = 0; i < data_num; i++) { + buffer[i] = dot(feature, precached + i * data_length, data_length) / + (unit_feature * unit_precached_arr[i]); + } + k_selection_sort_index(buffer, k_index, k_value, data_num, k); +} \ No newline at end of file diff --git a/cvimath/src/chl_quan.cpp b/cvimath/src/chl_quan.cpp new file mode 100644 index 000000000..e97c3ea12 --- /dev/null +++ b/cvimath/src/chl_quan.cpp @@ -0,0 +1,118 @@ +#include + +#include +#include +#include +#include + +void cvm_get_chl_quan(float real_multiplier, uint32_t *quantized_multiplier, int *right_shift) { + if (real_multiplier <= 0.f || real_multiplier > 1.f) { + std::cerr << "Multiplier should be bigger than 0, smaller or euqal to 1." 
<< std::endl; + *quantized_multiplier = 0; + *right_shift = 0; + return; + } else if (real_multiplier == 1.f) { + *quantized_multiplier = (uint32_t)(1ll << 31) - 1; + *right_shift = 0; + } else { + int s = 0; + // We want to bring the real multiplier into the interval [1/2, 1). + // We can do so by multiplying it by two, and recording how many times + // we multiplied by two so that we can compensate that by a right + // shift by the same amount. + while (real_multiplier < 0.5f) { + real_multiplier *= 2.0f; + s++; + } + // Now that the real multiplier is in [1/2, 1), we convert it + // into a fixed-point number. + int64_t q = static_cast(round(real_multiplier * (1ll << 31))); + assert(q <= (1ll << 31)); + // Handle the special case when the real multiplier was so close to 1 + // that its fixed-point approximation was undistinguishable from 1. + // We handle this by dividing it by two, and remembering to decrement + // the right shift amount. + if (q == (1ll << 31)) { + q /= 2; + s--; + } + assert(s >= 0); + assert(q <= (int64_t)LONG_MAX); + *quantized_multiplier = (uint32_t)q; + *right_shift = s; + } +} + +inline void cvm_pack_per_chan_cal_data(uint32_t channels, bool has_bias, int32_t *bias, + uint32_t *multiplier, int8_t *shift, uint8_t *packed_data) { + uint8_t *ptr = packed_data; + + for (uint32_t i = 0; i < channels; i++) { + if (has_bias) { + uint32_t val = (uint32_t)bias[i]; + *ptr = val & 0xff; + ptr++; + *ptr = (val >> 8) & 0xff; + ptr++; + *ptr = (val >> 16) & 0xff; + ptr++; + *ptr = (val >> 24) & 0xff; + ptr++; + } + + { + uint32_t val = multiplier[i]; + *ptr = val & 0xff; + ptr++; + *ptr = (val >> 8) & 0xff; + ptr++; + *ptr = (val >> 16) & 0xff; + ptr++; + *ptr = (val >> 24) & 0xff; + ptr++; + } + + { + uint8_t val = shift[i]; + *ptr = val; + ptr++; + } + } +} + +void cvm_fill_chl_quan_data(const uint32_t c, const uint32_t quantized_multiplier, + const int right_shift, uint8_t *cal_data, int32_t *bias_data, + bool has_bias) { + // Create tl_multiplier + 
uint32_t *multiplier_data = new uint32_t[c]; + int8_t *shift_data = new int8_t[c]; + for (unsigned int i = 0; i < c; ++i) { + // multipliers typically range in [2^30 ; 2^31 - 1]. + // Values in [0, 2^30 - 1] are normally unused, but harmless. + // Thus a good way to randomize multipliers is to subtract from them + // a random value smaller than 2^30 but still significant compared to it. + multiplier_data[i] = quantized_multiplier; + + // Our H/W only supports right shift + shift_data[i] = right_shift > 0 ? right_shift : 0; + +#ifdef ENABLE_DEBUG_MSG + printf(" [oc=%d] multiplier_data %d, shift_data %d\n", i, p_param->multiplier_data[i], + p_param->shift_data[i]); +#endif + } + + cvm_pack_per_chan_cal_data(c, has_bias, bias_data, multiplier_data, shift_data, cal_data); + delete[] multiplier_data; + delete[] shift_data; +} + +uint8_t *cvm_get_chl_quan_data(const uint32_t c, const uint32_t quantized_multiplier, + const int &right_shift, int32_t *bias_data, bool has_bias) { + const int per_chan_cal_data_size = + has_bias ? 
CVK_MULTIPLIER_BIAS_PACKED_DATA_SIZE : CVK_MULTIPLIER_ONLY_PACKED_DATA_SIZE; + const int cal_data_size = c * per_chan_cal_data_size; + uint8_t *cal_data = (uint8_t *)malloc(cal_data_size); + cvm_fill_chl_quan_data(c, quantized_multiplier, right_shift, cal_data, bias_data, has_bias); + return cal_data; +} diff --git a/cvimath/src/common.c b/cvimath/src/common.c new file mode 100644 index 000000000..534d489e1 --- /dev/null +++ b/cvimath/src/common.c @@ -0,0 +1,1032 @@ +/** + * \breif common wrap function for lut + */ +#include +#include "gen_lut.h" // NOLINT + +void cvm_table_shape(cvk_context_t* ctx, cvk_tl_shape_t* s) { + // MUST valid + ASSERT(s); + + uint32_t npu_num = ctx->info.npu_num; + s->n = 1; + s->c = npu_num; + s->h = cvm_table_h(); + s->w = cvm_table_w(); // hard code for hw, hw:32x8 +} + +void cvm_table_check(cvk_tl_t* IN tl_ifmap, cvk_tl_t* tbl_answer, cvk_tl_t* tbl_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16) { + // MUST valid + ASSERT(tl_ofmap_bf16); + ASSERT(tl_ifmap); + ASSERT(tbl_answer); + ASSERT(tbl_answer_mantissa); + + // shape should be same + + // TODO table channel should be great equal input + + // currently ONLY support bf16 + ASSERT(tl_ifmap->fmt == CVK_FMT_BF16); + ASSERT(tbl_answer->fmt == CVK_FMT_BF16); + ASSERT(tbl_answer_mantissa->fmt == CVK_FMT_BF16); + ASSERT(tl_ofmap_bf16->fmt == CVK_FMT_BF16); + + // table shape should fix + ASSERT(is_1880v2_tbl_shape(&tbl_answer->shape)); + ASSERT(is_1880v2_tbl_shape(&tbl_answer_mantissa->shape)); +} + +static void _bf16_table_check(cvk_tl_t* IN tl_ifmap, cvk_tl_t* IN tl_buf, cvk_tl_t* tbl_answer, + cvk_tl_t* tbl_answer_mantissa, cvk_tl_t* OUT tl_ofmap_bf16) { + // check table / input / output + cvm_table_check(tl_ifmap, tbl_answer, tbl_answer_mantissa, tl_ofmap_bf16); + + // check buf + ASSERT(tl_buf); + ASSERT(tl_buf->fmt == CVK_FMT_BF16); + + // TODO: remove ASSERT for -O2 +} + +int _cvm_lut_exp_mantissa(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* IN tl_buf, + cvk_tl_t* 
tbl_answer, cvk_tl_t* tbl_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, bool is_dirty_ifmap) { + cvk_tl_t* tmp = tl_buf; + if (is_dirty_ifmap) { + tmp = tl_ifmap; + } + + // check table / input / output + _bf16_table_check(tl_ifmap, tl_ifmap, tbl_answer, tbl_answer_mantissa, tl_ofmap_bf16); + + // issue lut cmd + cvk_tdma_l2l_tensor_copy_param_t p10; + // remove low 8 bits by int8 copy with stride + // ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + // ops->tiu_lookup_table(ctx, &p12); + + // ops->tiu_lookup_table(ctx, &p12); + + // sqrt = (2^exp) * mantissa + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ofmap_bf16; + p1.b_is_const = 0; + p1.b = tmp; + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + return 0; +} + +int cvm_lut_exp_mantissa(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* IN tl_buf, + cvk_tl_t* tbl_answer, cvk_tl_t* tbl_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16) { + return _cvm_lut_exp_mantissa(ctx, tl_ifmap, tl_buf, tbl_answer, tbl_answer_mantissa, + tl_ofmap_bf16, false); +} + +// \int8_rnd_mode 1 is rounding to 0, e.g: 1.3->1, -1.3->-1, -1.5->-2 +// 0 is rounding to nearset even, e.g: 1.3->1, -1.3->-1, -1.7->-2 +// \return convert bf16 as int8 and locate to lower part +// e.g.: 24 = 0x18 = 1.5* 2^4 = 0x41C0 +// cvm_get_tbl_idx(0x41C0,CVK_FMT_U8) = 0x0018 +void _cvm_get_tbl_idx(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t src_fmt, int int8_rnd_mode) { + ASSERT((int8_rnd_mode == 0 || int8_rnd_mode == 1) && "only support 2 mode"); + + ASSERT(tl_ifmap->fmt == CVK_FMT_BF16); + ASSERT(tl_ofmap_bf16->fmt == CVK_FMT_BF16); + + // get index + cvk_tl_shape_t tl_ofmap_A_idx_int8_shape = {tl_ofmap_bf16->shape.n, tl_ofmap_bf16->shape.c, + tl_ofmap_bf16->shape.h * tl_ofmap_bf16->shape.w, 1}; + + cvk_tl_t dst; + bmk1880v2_tensor_lmem_s_copy(&dst, tl_ofmap_bf16); + dst.start_address = tl_ofmap_bf16->start_address; + 
dst.fmt = src_fmt; + dst.shape = tl_ofmap_A_idx_int8_shape; + dst.stride = ctx->ops->tl_default_stride(ctx, dst.shape, dst.fmt, CTRL_NULL); + dst.stride.h = dst.stride.h * 2; + dst.int8_rnd_mode = int8_rnd_mode; + + cvk_tdma_l2l_tensor_copy_param_t p10; + p10.dst = &dst; + p10.src = tl_ifmap; + p10.mv_lut_base = false; // MUST init by ifself in soc + p10.mv_lut_idx = false; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); +} + +void cvm_get_uint8_t_tbl_idx(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* OUT tl_ofmap_bf16) { + _cvm_get_tbl_idx(ctx, tl_ifmap, tl_ofmap_bf16, CVK_FMT_U8, 0); +} + +/* + * \brief get bf16 decimal part, cvm_get_dec(12.3) = 12.0 + * it leverages bf16->int8 get integer and move to bf16 + * \tl_ifmap should be CVK_FMT_BF16 format / size + */ +void cvm_get_dec(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* OUT tl_ofmap_bf16) { + ASSERT(tl_ifmap->fmt == CVK_FMT_BF16); + ASSERT(tl_ofmap_bf16->fmt == CVK_FMT_BF16); + + cvk_tdma_l2l_tensor_copy_param_t p10; + cvk_tl_t dst, src; + bmk1880v2_tensor_lmem_s_copy(&src, tl_ifmap); + bmk1880v2_tensor_lmem_s_copy(&dst, tl_buf); + + dst.fmt = CVK_FMT_I8; + dst.stride = ctx->ops->tl_default_stride(ctx, dst.shape, dst.fmt, CTRL_AL); + + // bf16 -> int8 + p10.dst = &dst; + p10.src = &src; + p10.mv_lut_base = false; // MUST init by ifself in soc + p10.mv_lut_idx = false; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + + // int8 -> bf16 + // src.fmt = CVK_FMT_I8; + // cvk_tl_shape_t tl_ofmap_A_idx_int8_shape = {tl_ofmap_bf16->shape.n, tl_ofmap_bf16->shape.c, + // tl_ofmap_bf16->shape.h * tl_ofmap_bf16->shape.w, 1}; + // src.shape = tl_ofmap_A_idx_int8_shape; + // src.stride = ctx->ops->tl_default_stride(ctx, src.shape, /*eu_align*/ 1, + // src.fmt); src.stride.w = 2; + + // cvk_tl_shape_t tl_dst_reshape = {tl_ofmap_bf16->shape.n, tl_ofmap_bf16->shape.c, + // 1, tl_ofmap_bf16->shape.h * tl_ofmap_bf16->shape.w}; + + p10.dst = tl_ofmap_bf16; + p10.src = &dst; + 
ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); +} + +// \return decimal fractions / mantissa_as_idx, +// e.g: cvm_get_dec_fractions(12.341) = 0.341 +// NOTICE: we use bf16->i8, the decimal part should be -127 ~ +127 +void cvm_get_dec_fractions(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* OUT buf, + cvk_tl_t* OUT tl_ofmap_bf16) { + ASSERT(tl_ifmap->fmt == CVK_FMT_BF16); + ASSERT(tl_ofmap_bf16->fmt == CVK_FMT_BF16); + + // idx(i8) to bf16 format to sub it + cvm_get_dec(ctx, tl_ifmap, tl_ofmap_bf16, buf); + + // mantissa part -> s.b.b to get mantissa + cvk_tiu_sub_param_t p5; + p5.res_high = 0; + p5.res_low = tl_ofmap_bf16; + p5.a_high = 0; + p5.a_low = tl_ifmap; + p5.b_high = 0; + p5.b_low = buf; + p5.rshift_bits = 0; + + ctx->ops->tiu_sub(ctx, &p5); +} + +/** + * \table_shape return table shape under 1880v2 BF16 + * \return table byte size under BF16 + */ +uint64_t cvm_lut_tbl_bytesize(cvk_context_t* ctx, cvk_tl_shape_t* table_shape, cvk_fmt_t fmt) { + ASSERT(table_shape); + + int data_type_size = bytesize_of_fmt(fmt); + cvm_table_shape(ctx, table_shape); + uint64_t table_size = tl_shape_size(table_shape); + + return table_size * data_type_size; +} + +/** + * \brief f(x) = x*x + */ +int cvm_emit_square(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t fmt) { + (void)fmt; + ASSERT(tl_ifmap->fmt == CVK_FMT_BF16); + ASSERT(tl_ofmap_bf16->fmt == CVK_FMT_BF16); + + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ifmap; + p1.b_is_const = 0; + p1.b = tl_ifmap; + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + return 0; +} + +/** + * \brief f(x) = |x| + * TODO: check tl_ifmap->start_addr != tl_ofmap_bf16->start_addr + */ +int cvm_emit_abs(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t fmt) { + (void)fmt; + ASSERT(tl_ifmap->fmt == tl_ofmap_bf16->fmt && "fmt should be equal"); + + uint32_t b_const = -1; + if (tl_ifmap->fmt) { + b_const 
= convert_fp32_bf16(-1.0); + } + + // abs it, multiply -1 + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ifmap; + p1.b_is_const = 1; + p1.b_const.val = b_const; + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + // abs it, get max + cvk_tiu_max_param_t p; + p.max = tl_ofmap_bf16; + p.a = tl_ofmap_bf16; + p.b_is_const = 0; + p.b = tl_ifmap; + + ctx->ops->tiu_max(ctx, &p); + + return 0; +} + +/** + * \brief pythagoras p(x, y) = pow(x*x + y*y, 0.5) + * plz refer [here](http://www.themathpage.com/Alg/pythagorean-distance.htm) + */ +int cvm_emit_pythagoras(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_sqrt_table_answer, + cvk_tl_t* tl_sqrt_table_answer_mantissa, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t fmt) { + // y0 = x * x + cvm_emit_square(ctx, x, tl_buf, fmt); + +#if 1 + // y0 = y0 + y * y + cvk_tiu_mac_param_t p2; + p2.res_high = 0; + p2.res_low = tl_buf; + p2.res_is_int8 = 0; + p2.a = y; + p2.b_is_const = 0; + p2.b = y; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + + ctx->ops->tiu_mac(ctx, &p2); +#else + // y * y + cvm_emit_square(ctx, y, tl_buf2, fmt); + // y = x + y + { + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_buf; + p4.a_high = 0; + p4.a_low = tl_buf2; + p4.b_is_const = 0; + p4.b.high = 0; + p4.b.low = tl_buf; + p4.rshift_bits = 0; + p4.relu_enable = 0; + + ctx->ops->tiu_add(ctx, &p4); + } +#endif + + // y0 = sqrt(y0) + cvm_emit_sqrt(ctx, tl_buf, tl_buf2, tl_sqrt_table_answer, tl_sqrt_table_answer_mantissa, + tl_ofmap_bf16); + return 0; +} + +void cvm_gen_0_tbl(uint16_t* OUT table_0, cvk_tl_shape_t* table_shape) { + ASSERT(is_1880v2_tbl_shape(table_shape)); + + uint32_t half = half_h_table(); + int table_hw = cvm_table_hw(); + + table_0[0] = convert_fp32_bf16(1.0); + + for (uint32_t i = 1; i < half * 2; i++) { + table_0[i] = convert_fp32_bf16(0.0); + } + +#ifdef 
DBG + for (uint32_t i = 0; i < 2 * half; i++) { + printf("lut [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_0[i]), table_0[i]); + } +#endif /* ifdef DBG */ + + // duplicate channel #1 to #31 + // TODO: tensor copy + for (uint64_t i = 1; i < table_shape->c; i++) { + memcpy(&table_0[table_hw * i], &table_0[0], sizeof(uint16_t) * table_hw); + } +} + +/** + * \brief check which element is 0, return 1 others return 0 + * e.g: input = [0, 1, -1, 2] output [1, 0, 0, 0] + */ +int cvm_emit_0_idx(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tbl_answer, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + // check table / input / output + _bf16_table_check(tl_ifmap, tl_buf, tbl_answer, tbl_answer, tl_ofmap_bf16); + + ASSERT(fmt); + + // TODO: add fmt parameter? + // abs for \bf16_get_uint8_t_tbl_idx we use bf16->uint8_t + // cvm_emit_abs(ctx, tl_ifmap, tl_ofmap_bf16, CVK_FMT_BF16); + // TODO check if address == of address + // cvm_get_uint8_t_tbl_idx(ctx, tl_ofmap_bf16, tl_buf); + // re-scale 0.xx to x. 
+ // cvm_emit_mul_const(ctx, tl_ifmap, tl_buf, fmt, 1000); + + // we directly use mantissa as index, try to add mantissa and mul to filter 2's power + // cvm_emit_abs(ctx, tl_ifmap, tl_buf, fmt); + // cvm_emit_add_const(ctx, tl_buf, tl_buf, fmt, convert_bf16_fp32(0x7f) + 1); + // cvm_emit_mul(ctx, tl_ifmap, tl_buf, tl_buf, fmt); + + cvk_tiu_lookup_table_param_t p12; +#if 1 + // NOTICE: we use int8 mul to enlarge 2^n + cvk_tl_t src, dst; + bmk1880v2_tensor_lmem_s_copy(&src, tl_ifmap); + bmk1880v2_tensor_lmem_s_copy(&dst, tl_buf); + + src.fmt = CVK_FMT_U8; + src.shape.w = src.shape.w * 2; // real size + src.stride = ctx->ops->tl_default_stride(ctx, src.shape, src.fmt, CTRL_NULL); + dst.shape = src.shape; + dst.fmt = src.fmt; + dst.stride = src.stride; + + cvk_tiu_mul_param_t p; + p.res_high = NULL; + p.res_low = &dst; + p.a = &src; + p.b_is_const = 1; + p.b_const.val = 255; // saturate + p.b_const.is_signed = 0; + p.rshift_bits = 2; // avoid unnormal + p.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p); + + // get 2^x and 0 + p12.ofmap = tl_buf; + p12.ifmap = tl_buf; + p12.table = tbl_answer; + ctx->ops->tiu_lookup_table(ctx, &p12); + + // cvm_get_uint8_t_tbl_idx(ctx, tl_buf, tl_ofmap_bf16); + _cvm_get_tbl_idx(ctx, tl_ifmap, tl_ofmap_bf16, CVK_FMT_I8, 0); + + // get 0ops->tiu_lookup_table(ctx, &p12); + + cvm_emit_mul(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); +#else + + _cvm_get_tbl_idx(ctx, tl_ifmap, tl_ofmap_bf16, CVK_FMT_I8, 1); + p12.ofmap = tl_ofmap_bf16; + p12.ifmap = tl_ofmap_bf16; + p12.table = tbl_answer; + ctx->ops->tiu_lookup_table(ctx, &p12); +#endif + return 0; +} + +/** + * \brief max(x, const) + * e.g.: x = [1, 2, 3, 4, -1, -2], const = 1 y = [1, 2, 3, 1, 1] + */ +int cvm_emit_max_const(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t fmt, float b) { + (void)fmt; + cvk_tiu_max_param_t p; + p.max = tl_ofmap_bf16; + p.a = tl_ifmap; + p.b_is_const = 1; + p.b_const.val = convert_fp32_bf16(b); + + ctx->ops->tiu_max(ctx, 
&p); + + return 0; +} + +/** + * \brief min(x, const) + * e.g.: x = [1, 2, 3, 4, -1, -2], const = 1 y = [1, 1, 1, -1, -2] + */ +int cvm_emit_min_const(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t fmt, float b) { + (void)fmt; + cvk_tiu_min_param_t p7; + p7.min = tl_ofmap_bf16; + p7.a = tl_ifmap; + p7.b_is_const = 1; + p7.b_const.val = convert_fp32_bf16(b); + p7.b_const.is_signed = 1; + + ctx->ops->tiu_min(ctx, &p7); + + return 0; +} + +// pre process pos/neg +static int _cvm_emit_pre_pos_neg(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_pos_neg_table, cvk_tl_t* OUT tl_ofmap_bf16) { + // check table / input / output + _bf16_table_check(tl_ifmap, tl_buf, tl_pos_neg_table, tl_pos_neg_table, tl_ofmap_bf16); + + //_cvm_get_tbl_idx(ctx, tl_ifmap, tl_buf, CVK_FMT_I8); + + // seperate >=0 and < 0 + // p = table_pos_neg(x0) (>= 0 return 1, < 0 return -1) + cvk_tdma_l2l_tensor_copy_param_t p10; + p10.dst = tl_buf; + p10.src = tl_ifmap; + p10.mv_lut_base = false; // MUST init by ifself in soc + p10.mv_lut_idx = true; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + cvk_tiu_lookup_table_param_t p12; + p12.ofmap = tl_buf; + p12.ifmap = tl_buf; + p12.table = tl_pos_neg_table; + ctx->ops->tiu_lookup_table(ctx, &p12); + + return 0; +} + +/** + * \brief check elements are < 0 + * \tl_pos_neg_table plz refer \bf16_atan_pos_neg + * e.g: input = [0, 10, 6, -1, 0] output [0, 0, 0, 1, 0] + */ +int cvm_emit_neg_idx(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_pos_neg_table, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + _cvm_emit_pre_pos_neg(ctx, tl_ifmap, tl_buf, tl_pos_neg_table, tl_ofmap_bf16); + + // sub 1, [1 -1] -> [0 -2] + cvm_emit_add_const(ctx, tl_buf, tl_buf, fmt, -1.0); + + // abs, [0 -2] -> [0 2] + cvm_emit_abs(ctx, tl_buf, tl_ofmap_bf16, fmt); + + // mul 1/2 [0 2] -> [0 1] + cvm_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 0.5); + + return 
0; +} + +/** + * \brief check elements are >= 0 + * \tl_pos_neg_table plz refer \bf16_atan_pos_neg + * e.g: input = [0, 10, 6, -1, 0] output [0, 1, 1, 0, 0] + */ +int cvm_emit_pos_idx(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_pos_neg_table, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + _cvm_emit_pre_pos_neg(ctx, tl_ifmap, tl_buf, tl_pos_neg_table, tl_ofmap_bf16); + + // add 1, [1 -1] -> [2 0] + cvm_emit_add_const(ctx, tl_buf, tl_buf, fmt, 1.0); + + // mul 1/2 [2 0] -> [1 0] + cvm_emit_mul_const(ctx, tl_buf, tl_ofmap_bf16, fmt, 0.5); + + return 0; +} + +/** + * \brief invert 0/1 input + * e.g: input = [0, 1, 1, 1, 0] output [1, 0, 0, 0, 1] + */ +int _cvm_emit_0_1_revert_input(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, bool is_dirty_ifmap) { + // [-1, -1, 0, -1, 0] = sub([0 0 1 0 1], 1) + // [1, 1, 0, 1, 0] = abs([-1, -1, 0, -1, 0]) + cvk_tl_t* _tl_buf = tl_buf; + + // check buf + if (is_dirty_ifmap) { + _tl_buf = tl_ifmap; + } else { + ASSERT(tl_buf); + ASSERT(tl_buf->fmt == CVK_FMT_BF16); + } + + // sub 1, = add -1 + cvm_emit_add_const(ctx, tl_ifmap, _tl_buf, fmt, -1.0); + + // abs + cvm_emit_abs(ctx, _tl_buf, tl_ofmap_bf16, fmt); + + return 0; +} + +int cvm_emit_0_1_revert_input(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + return _cvm_emit_0_1_revert_input(ctx, tl_ifmap, tl_buf, tl_ofmap_bf16, fmt, false); +} +/** + * \brief invert 0/1 value + * e.g: input = [0, 10, 6, -1, 0] output [1, 0, 0, 0, 1] + * the step is [0, 10, 6, -1, 0] -> [0, 1, 1, 1, 0] -> [1, 0, 0, 0, 1] + */ +int cvm_emit_0_1_revert(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tbl_answer, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + // [-1, -1, 0, -1, 0] = sub([0 0 1 0 1], 1) + // [1, 1, 0, 1, 0] = abs([-1, -1, 0, -1, 0]) + + // check table / input / output + _bf16_table_check(tl_ifmap, tl_buf, 
tbl_answer, tbl_answer, tl_ofmap_bf16); + + // check which element is 0, return 1 others return 0 + cvm_emit_0_idx(ctx, tl_ifmap, tl_buf, tbl_answer, tl_ofmap_bf16, fmt); + + cvm_emit_0_1_revert_input(ctx, tl_ofmap_bf16, tl_buf, tl_ofmap_bf16, fmt); + + return 0; +} + +// \brief a(tensor) * b(tensor) +int cvm_emit_mul(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* IN tl_ifmap2, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + (void)fmt; + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ifmap; + p1.b_is_const = 0; + p1.b = tl_ifmap2; + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + return 0; +} + +// \brief a(tensor) * b(tensor) +int cvm_emit_add(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* IN tl_ifmap2, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + (void)fmt; + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_ofmap_bf16; + p4.a_high = 0; + p4.a_low = tl_ifmap; + p4.b_is_const = 0; + p4.b.high = 0; + p4.b.low = tl_ifmap2; + p4.rshift_bits = 0; + p4.relu_enable = 0; + + ctx->ops->tiu_add(ctx, &p4); + + return 0; +} + +int cvm_emit_add_const(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t fmt, float b) { + (void)fmt; + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_ofmap_bf16; + p4.a_high = 0; + p4.a_low = tl_ifmap; + p4.b_is_const = 1; + p4.b.high = 0; + p4.b_const.val = convert_fp32_bf16(b); + p4.rshift_bits = 0; + p4.relu_enable = 0; + + ctx->ops->tiu_add(ctx, &p4); + + return 0; +} + +// \brief a(tensor) * b(const) +int cvm_emit_mul_const(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t fmt, float b) { + (void)fmt; + uint32_t b_const = (int)b; + if (fmt == CVK_FMT_BF16) { + b_const = convert_fp32_bf16(b); + } + + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ifmap; + p1.b_is_const = 1; + p1.b_const.val = b_const; + p1.b_const.is_signed = 
1; + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + return 0; +} + +// \brief a(tensor) / b(const) +// NOTICE: it could dirty \y if \is_dirty_ifmap set true +int cvm_emit_x_over_y(cvk_context_t* ctx, cvk_tl_t* IN x, cvk_tl_t* IN y, cvk_tl_t* IN tl_buf, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_tl_t* tl_table_answer, + cvk_tl_t* tl_table_answer_mantissa, cvk_fmt_t fmt, bool is_dirty_ifmap) { + cvk_tl_t* tmp = tl_buf; + if (is_dirty_ifmap) { + tmp = NULL; + } + + // y = reciprocal(y) + _cvm_lut_exp_mantissa(ctx, y, tmp, tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16, + is_dirty_ifmap); + + // x / y = x * (1/y) + cvm_emit_mul(ctx, x, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + return 0; +} + +int _cvm_emit_mask(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, cvk_tl_t* tl_buf2, + cvk_tl_t* tl_buf3, cvk_tl_t* tl_pos_neg_table, cvk_tl_t* tl_0_idx_table, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, enum CVM_MASK_TYPE mask, + bool is_dirty_ifmap) { + _bf16_table_check(tl_ifmap, tl_buf, tl_pos_neg_table, tl_0_idx_table, tl_ofmap_bf16); + if (is_dirty_ifmap) { + } else { + } + + switch (mask) { + case CVM_MASK_TYPE_GT_0: + // x > 0 + { + // x >= 0 + cvm_emit_pos_idx(ctx, tl_ifmap, tl_buf, tl_pos_neg_table, tl_buf2, fmt); + + cvk_tl_t* out = tl_ofmap_bf16; + cvk_tl_t* in = tl_ofmap_bf16; + if (is_dirty_ifmap) { + // x = 0 + cvm_emit_0_idx(ctx, tl_ifmap, tl_buf, tl_0_idx_table, tl_ofmap_bf16, + fmt); // 0.003 could consider 1 + // !(x = 0) + _cvm_emit_0_1_revert_input(ctx, tl_ofmap_bf16, NULL, tl_buf, fmt, true); + in = tl_buf; + out = tl_ofmap_bf16; + } else { + // x = 0 + cvm_emit_0_idx(ctx, tl_ifmap, tl_buf, tl_0_idx_table, tl_buf3, + fmt); // 0.003 could consider 1 + // !(x = 0) + cvm_emit_0_1_revert_input(ctx, tl_buf3, tl_buf, tl_ofmap_bf16, fmt); + } + + // x > 0 = (x >= 0 && !(x = 0)) + cvm_emit_mul(ctx, in, tl_buf2, out, fmt); + } + break; + case CVM_MASK_TYPE_GE_0: + // y >= 0 + + cvm_emit_pos_idx(ctx, tl_ifmap, tl_buf, 
tl_pos_neg_table, tl_ofmap_bf16, fmt); + break; + case CVM_MASK_TYPE_EQ_0: + cvm_emit_0_idx(ctx, tl_ifmap, tl_buf, tl_0_idx_table, tl_ofmap_bf16, + fmt); // 0.003 could consider 1 + break; + case CVM_MASK_TYPE_LT_0: + // x < 0 + + // x < 0 + cvm_emit_neg_idx(ctx, tl_ifmap, tl_buf, tl_pos_neg_table, tl_ofmap_bf16, fmt); + + break; + case CVM_MASK_TYPE_LE_0: + // x < 0 + cvm_emit_neg_idx(ctx, tl_ifmap, tl_buf, tl_pos_neg_table, tl_ofmap_bf16, fmt); + + // x = 0 + cvm_emit_0_idx(ctx, tl_ifmap, tl_buf, tl_0_idx_table, tl_buf2, + fmt); // 0.003 could consider 1 + + // x <= 0 = (x < 0 || (x = 0)) + cvm_emit_add(ctx, tl_ofmap_bf16, tl_buf2, tl_ofmap_bf16, fmt); + break; + default: + ASSERT(0 && "not support yet"); + } + return 0; +} + +/** + * \brief return > 0 mask + * e.g.: [1 2 -1 0 -3 -4 ] -> [1 1 0 0 0 0] + */ +int cvm_emit_mask_gt0(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_pos_neg_table, + cvk_tl_t* tl_0_idx_table, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + return cvm_emit_mask(ctx, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_pos_neg_table, tl_0_idx_table, + tl_ofmap_bf16, fmt, CVM_MASK_TYPE_GT_0); +} + +/** + * \brief return >= 0 mask + * e.g.: [1 2 -1 0 -3 -4 ] -> [1 1 0 1 0 0] + */ +int cvm_emit_mask_ge0(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_pos_neg_table, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + return cvm_emit_mask(ctx, tl_ifmap, tl_buf, + tl_buf, // fake + tl_buf, // fake + tl_pos_neg_table, + tl_pos_neg_table, // fake + tl_ofmap_bf16, fmt, CVM_MASK_TYPE_GE_0); +} + +/** + * \brief return <= 0 mask + * e.g.: [1 2 -1 0 -3 -4 ] -> [0 0 1 1 0 0] + */ +int cvm_emit_mask_le0(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_pos_neg_table, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + return cvm_emit_mask(ctx, tl_ifmap, tl_buf, + tl_buf, // fake + tl_buf, // fake + tl_pos_neg_table, + tl_pos_neg_table, // fake + 
tl_ofmap_bf16, fmt, CVM_MASK_TYPE_LE_0); +} + +/** + * \brief return = 0 mask + * e.g.: [1 2 -1 0 -3 -4 ] -> [0 0 0 1 0 0] + */ +int cvm_emit_mask_eq0(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_0_idx_table, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + return cvm_emit_mask(ctx, tl_ifmap, tl_buf, + tl_buf, // fake + tl_buf, // fake + tl_0_idx_table, // fake + tl_0_idx_table, tl_ofmap_bf16, fmt, CVM_MASK_TYPE_EQ_0); +} + +/** + * \brief return < 0 mask + * e.g.: [1 2 -1 0 -3 -4 ] -> [0 0 1 0 1 1] + */ +int cvm_emit_mask_lt0(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_pos_neg_table, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + return cvm_emit_mask(ctx, tl_ifmap, tl_buf, + tl_buf, // fake + tl_buf, // fake + tl_pos_neg_table, + tl_pos_neg_table, // fake + tl_ofmap_bf16, fmt, CVM_MASK_TYPE_LT_0); +} + +int cvm_emit_mask(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* tl_buf, cvk_tl_t* tl_buf2, + cvk_tl_t* tl_buf3, cvk_tl_t* tl_pos_neg_table, cvk_tl_t* tl_0_idx_table, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, enum CVM_MASK_TYPE mask) { + return _cvm_emit_mask(ctx, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_pos_neg_table, tl_0_idx_table, + tl_ofmap_bf16, fmt, mask, false); +} + +// return x >=0 to 1, x < 0 is -1 +void cvm_emit_mask_ge0_lt0(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* index_i8, + cvk_tl_t* OUT tl_buf3, cvk_fmt_t fmt) { + cvk_tiu_mul_param_t p; + cvk_tdma_l2l_tensor_copy_param_t p1; + + // get y < 0, bf16->int8 and mul 0xff to get -128 and righshift to get 1 + cvm_emit_mul_const(ctx, y, tl_buf3, fmt, pow(2, 64)); + p1.src = tl_buf3; + p1.dst = index_i8; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + p.res_high = 0; + p.res_low = index_i8; + p.a = index_i8; + p.b_is_const = 1; + p.b_const.val = -128; + p.b_const.is_signed = 1; + p.rshift_bits = 0; + p.relu_enable = 1; + + ctx->ops->tiu_mul(ctx, &p); + + p.res_high = 0; + p.res_low = index_i8; + p.a = index_i8; + p.b_is_const = 1; 
+ p.b_const.val = 1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 1; + + ctx->ops->tiu_mul(ctx, &p); + + // get y < 0 indicate 1 + p1.src = index_i8; + p1.dst = tl_buf3; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + // merge, y >= 0 is 1, y < 0 is -1 + cvm_emit_mul_const(ctx, tl_buf3, tl_buf3, fmt, -2.0); + cvm_emit_add_const(ctx, tl_buf3, tl_buf3, fmt, 1.0); + +#if 0 + cvm_emit_mul_const(ctx, tl_buf3, tl_buf3, fmt, -1.0); + + // get y > 0 + // y * (-1) + 1 get 0/1 map, 1 indicate xy == 0 + cvm_emit_add_const(ctx, tl_buf3, tl_buf2, fmt, 1.0); + + // reduce y == 0 + if (0) + { + cvk_tiu_max_param_t p3; + cvk_tl_t index_i8; + bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, index_i8, tl_ofmap_bf16, CVK_FMT_I8); + cvm_emit_mul_const(ctx, y, tl_buf, fmt, -1); + p3.max = tl_buf; + p3.a = y; + p3.b_is_const = 0; + p3.b = tl_buf; + + ctx->ops->tiu_max(ctx, &p3); + cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, convert_bf16_fp32(0x7f00)); + //bf16_emit_mul_const(ctx, tl_buf, tl_buf, fmt, pow(2, 64)); + + p1.src = tl_buf; + p1.dst = index_i8; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + p.res_high = NULL; + p.res_low = index_i8; + p.a = index_i8; + p.b_is_const = 1; + p.b_const.val =-1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p); + + + p1.src = index_i8; + p1.dst = tl_buf3; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + //revert it + cvm_emit_mul_const(ctx, tl_buf3, tl_buf3, fmt, -1.0); + //bf16_emit_add_const(ctx, tl_buf3, tl_buf3, fmt, 1); + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + } + + cvm_emit_add(ctx, tl_buf2, tl_buf3, tl_buf3, fmt); +#endif +} + +/* + * \return -1 means others, 0 indicate 0 + */ +void cvm_emit_mask_eq_0(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* tl_buf, cvk_tl_t* index_i8, + cvk_tl_t* OUT tl_buf3, cvk_fmt_t fmt) { + cvk_tdma_l2l_tensor_copy_param_t p1; + cvk_tiu_mul_param_t p; + + cvm_emit_abs(ctx, y, tl_buf, fmt); + // cvm_emit_mul_const(ctx, y, 
tl_buf, fmt, -1); + // cvk_tiu_max_param_t p3; + // p3.max = tl_buf; + // p3.a = y; + // p3.b_is_const = 0; + // p3.b = tl_buf; + + // ctx->ops->tiu_max(ctx, &p3); + cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, convert_bf16_fp32(0x7f00)); + + p1.src = tl_buf; + p1.dst = index_i8; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + p.res_high = NULL; + p.res_low = index_i8; + p.a = index_i8; + p.b_is_const = 1; + p.b_const.val = -1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p); + + p1.src = index_i8; + p1.dst = tl_buf3; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); +} + +int cvm_bytesize_of_fmt(cvk_fmt_t fmt) { return bytesize_of_fmt(fmt); } + +// dirty itself +int cvm_reduce_hw_mul(cvk_context_t* cvk_ctx, cvk_tl_t* mp_tl_mulsum) { + cvk_tl_shape_t m_tl_mulsum_shape = mp_tl_mulsum->shape; + uint32_t total_data_size = m_tl_mulsum_shape.h * m_tl_mulsum_shape.w; + uint32_t data_size = total_data_size; + uint32_t fmt_size = cvm_bytesize_of_fmt(mp_tl_mulsum->fmt); + cvk_tiu_mul_param_t p_mul; + cvk_tl_t tl_1; + cvk_tl_t tl_2; + tl_1.fmt = mp_tl_mulsum->fmt; + tl_2.fmt = mp_tl_mulsum->fmt; + while (data_size > 1) { + uint32_t start_addr = mp_tl_mulsum->start_address; + bool add_1 = false; + if (data_size % 2 != 0) { + add_1 = true; + data_size -= 1; + start_addr += fmt_size; + } + data_size /= 2; + uint32_t w = data_size; + uint32_t h = 1; + size_t m = w / 2; + for (size_t i = 2; i < m; i++) { + if (data_size % i == 0) { + w = data_size / i; + h = i; + if (w < 4063) { + break; + } + } + } + tl_1.start_address = start_addr; + tl_2.start_address = start_addr + (h * w * fmt_size); + tl_1.shape.n = 1; + tl_1.shape.c = m_tl_mulsum_shape.c; + tl_1.shape.h = h; + tl_1.shape.w = w; + tl_1.stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_1.shape, tl_1.fmt, 1); + tl_2.shape = tl_1.shape; + tl_2.stride = tl_1.stride; + p_mul.a = &tl_1; + p_mul.b = &tl_2; + p_mul.res_low = &tl_1; + p_mul.res_high = NULL; + 
p_mul.b_is_const = 0; + p_mul.rshift_bits = 0; + p_mul.relu_enable = 0; + cvk_ctx->ops->tiu_mul(cvk_ctx, &p_mul); + if (add_1) { + data_size += 1; + } + } + return 0; +} diff --git a/cvimath/src/fp32_bf16_kernel.c b/cvimath/src/fp32_bf16_kernel.c new file mode 100644 index 000000000..38e305336 --- /dev/null +++ b/cvimath/src/fp32_bf16_kernel.c @@ -0,0 +1,138 @@ +#include +#include "gen_lut.h" + +// only fill base_reg_index/int8_rnd_mode +static void init_tgmem(cvk_tg_t* t) { + t->base_reg_index = 0; + t->int8_rnd_mode = 0; +} + +int cvm_s2s_fp32_bf16(cvk_context_t* ctx, uint64_t gaddr_fp32, cvk_tg_shape_t fp32_shape, + uint64_t gaddr_bf16, cvk_tg_shape_t cvm_shape, cvk_fmt_t fmt) { + int ret = 0; + ASSERT(fmt == CVK_FMT_BF16 && "only support CVK_FMT_BF16"); + ASSERT(fp32_shape.w % 2 == 0 && "fp32's w MUST align with 2"); + + cvk_tdma_g2g_tensor_copy_param_t p; + + cvk_tg_t src, dst; + + init_tgmem(&src); + init_tgmem(&dst); + + int fp32_w = 2; + src.fmt = fmt; + src.start_address = gaddr_fp32 + fp32_w; // copy from high part + src.shape = fp32_shape; + src.shape.h = fp32_shape.w * fp32_shape.h / fp32_w; + src.shape.w = 1; + + int fmt_sz = bytesize_of_fmt(fmt); + src.stride.n = fp32_shape.w * fp32_shape.h * fp32_shape.c * fmt_sz; + src.stride.c = fp32_shape.w * fp32_shape.h * fmt_sz; + src.stride.h = fp32_w * fmt_sz; + + dst.fmt = fmt; + dst.start_address = gaddr_bf16; + dst.shape = cvm_shape; + dst.shape.h = cvm_shape.w * cvm_shape.h / fp32_w; + dst.shape.w = 1; + dst.stride = ctx->ops->tg_default_stride(ctx, dst.shape, dst.fmt); + + p.src = &src; + p.dst = &dst; + + ctx->ops->tdma_g2g_bf16_tensor_copy(ctx, &p); + + return ret; +} + +// default implement by s->s +void cvm_bf16_fp32(cvk_context_t* cvk_ctx, cvk_tg_t* tg_bf16, cvk_tg_t* tg_fp32) { +#if 0 + // sys->local->sys implement + cvk_fmt_t fmt = tg_bf16->fmt; + cvk_tl_shape_t tl_shape; + int ctrl = CTRL_AL; // eu align + + tl_shape.n = tg_fp32->shape.n; + tl_shape.c = tg_fp32->shape.c; + tl_shape.h = 
tg_fp32->shape.h; + tl_shape.w = tg_fp32->shape.w; + + // 1. fill local memory to 0 for mantissa + cvk_tl_t *tl_ofmap = cvk_ctx->ops->lmem_alloc_tensor(cvk_ctx, tl_shape, tg_bf16->fmt, ctrl); + cvk_tiu_mul_param_t p0; + p0.res_high = NULL; + p0.res_low = tl_ofmap; + p0.a = tl_ofmap; + p0.b_is_const = 1; + p0.b_const.val = 0; + p0.b_const.is_signed = 0; + p0.rshift_bits = 0; + p0.relu_enable = 0; + p0.layer_id = 0; + + cvk_ctx->ops->tiu_mul(cvk_ctx, &p0); + + + // pretend the same shape, reshape h, w to h * w, 1 + int fmt_bytesize = cvm_bytesize_of_fmt(tl_ofmap->fmt); + tl_ofmap->shape.w = 1; + tl_ofmap->shape.h = tg_bf16->shape.h * tg_bf16->shape.w; + tl_ofmap->stride.h = 4; + tl_ofmap->stride.c = align_up(tg_fp32->shape.w * tg_fp32->shape.h * fmt_bytesize, + cvk_ctx->info.eu_num); + tl_ofmap->stride.n = tl_ofmap->stride.c * ceiling_func(tg_fp32->shape.c, + cvk_ctx->info.npu_num); + + + // 2. load from tg with reshaped w + // FIXME: check overwrite + tl_ofmap->start_address = tl_ofmap->start_address + 2;// 2 means shift fp32 high 16 part + cvk_tdma_g2l_tensor_copy_param_t p; + p.src = tg_bf16; + p.dst = tl_ofmap; + cvk_ctx->ops->tdma_g2l_bf16_tensor_copy(cvk_ctx, &p); + + // 3. store back to tg + tl_ofmap->start_address = tl_ofmap->start_address - 2; //revert + tl_ofmap->shape = tl_shape; + tl_ofmap->stride = cvk_ctx->ops->tl_default_stride(cvk_ctx, tl_ofmap->shape, fmt, ctrl); + + cvk_tdma_l2g_tensor_copy_param_t p1; + p1.src = tl_ofmap; + p1.dst = tg_fp32; + cvk_ctx->ops->tdma_l2g_bf16_tensor_copy(cvk_ctx, &p1); + + cvk_ctx->ops->lmem_free_tensor(cvk_ctx, tl_ofmap); +#else + // sys->sys implement + // 1. fill tg with low 16but part as 0 + cvk_tdma_l2g_tensor_fill_constant_param_t p0; + p0.constant = 0; + p0.dst = tg_fp32; + p0.layer_id = 0; + cvk_ctx->ops->tdma_l2g_tensor_fill_constant(cvk_ctx, &p0); + + // 2. 
sys->sys + cvk_tdma_g2g_tensor_copy_param_t p1; + cvk_tg_shape_t shape = tg_fp32->shape; // backup + cvk_tg_stride_t stride = tg_fp32->stride; + + tg_fp32->shape.w = 1; + tg_fp32->shape.h = tg_bf16->shape.h * tg_bf16->shape.w; + tg_fp32->stride.h = 4; + + tg_fp32->start_address = tg_fp32->start_address + 2; // +2 means shift from high part + p1.src = tg_bf16; + p1.dst = tg_fp32; + p1.layer_id = 0; + cvk_ctx->ops->tdma_g2g_bf16_tensor_copy(cvk_ctx, &p1); + + // restore + tg_fp32->start_address = tg_fp32->start_address - 2; + tg_fp32->shape = shape; + tg_fp32->stride = stride; +#endif +} diff --git a/cvimath/src/gen_lut.h b/cvimath/src/gen_lut.h new file mode 100644 index 000000000..8c58a0c8c --- /dev/null +++ b/cvimath/src/gen_lut.h @@ -0,0 +1,207 @@ +#ifndef GEN_LUT_1880v2_H +#define GEN_LUT_1880v2_H + +#include +#include +#include + +#define IN +#define OUT +#define ASSERT(x) assert(x) + +static inline int cvm_exp_start() { return -62; } +static inline int cvm_exp_end() { return 63; } +static inline int cvm_table_h() { return 32; } +static inline int cvm_table_w() { return 8; } +static inline int cvm_table_hw() { return cvm_table_h() * cvm_table_w(); } +static inline int half_h_table() { return cvm_table_h() * cvm_table_w() / 2; } +static inline bool is_1880v2_tbl_shape(cvk_tl_shape_t *s) { + // FIXME: h could be reduce less than 32 + assert(s->h == (uint32_t)cvm_table_h() && s->w == (uint32_t)cvm_table_w() && + "table h/w should be 32/8"); + + return s->h == (uint32_t)cvm_table_h() && s->w == (uint32_t)cvm_table_w(); +} + +// copy cvk_tl_t structure +static inline void bmk1880v2_tensor_lmem_s_copy(cvk_tl_t *dst, cvk_tl_t *src) { + dst->start_address = src->start_address; + dst->fmt = src->fmt; + dst->shape = src->shape; + dst->stride = src->stride; + dst->int8_rnd_mode = src->int8_rnd_mode; +} + +static inline void bmk1880v2_tensor_lmem_s_copy_bf16_8(cvk_context_t *ctx, cvk_tl_t *dst, + cvk_tl_t *src, cvk_fmt_t fmt) { + assert(src->fmt == CVK_FMT_BF16 && (fmt == 
CVK_FMT_I8 || fmt == CVK_FMT_U8) && + "only support bf16->i8/uint8_t, plz check fmt\n"); + + dst->start_address = src->start_address; + dst->fmt = fmt; + dst->shape = src->shape; + dst->shape.w *= 2; + dst->stride = ctx->ops->tl_default_stride(ctx, dst->shape, fmt, CTRL_NULL); + // dst->shape.h *= 2; + // dst->stride = ctx->ops->tl_default_stride(ctx, dst->shape, + // /*eu_align*/ 1, + // fmt); + // dst->shape.h = src->shape.h; + dst->int8_rnd_mode = src->int8_rnd_mode; +} + +// l2l means we keep the same shape between bf16/(u)int8 +static inline void bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(cvk_context_t *ctx, cvk_tl_t *dst, + cvk_tl_t *src, cvk_fmt_t fmt) { + assert(src->fmt == CVK_FMT_BF16 && (fmt == CVK_FMT_I8 || fmt == CVK_FMT_U8) && + "only support bf16->i8/uint8_t, plz check fmt\n"); + + dst->start_address = src->start_address; + dst->fmt = fmt; + dst->shape = src->shape; + dst->stride = ctx->ops->tl_default_stride(ctx, dst->shape, fmt, CTRL_NULL); + dst->int8_rnd_mode = src->int8_rnd_mode; +} + +int cvm_emit_square(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *OUT tl_ofmap_bf16, + cvk_fmt_t fmt); + +void cvm_table_check(cvk_tl_t *IN tl_ifmap, cvk_tl_t *tbl_answer, cvk_tl_t *tbl_answer_mantissa, + cvk_tl_t *OUT tl_ofmap_bf16); + +int cvm_lut_exp_mantissa(cvk_context_t *ctx, cvk_tl_t *IN tl_ifmap, cvk_tl_t *IN tl_buf, + cvk_tl_t *tbl_answer, cvk_tl_t *tbl_answer_mantissa, + cvk_tl_t *OUT tl_ofmap_bf16); + +void cvm_get_uint8_t_tbl_idx(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *OUT tl_ofmap_bf16); + +void cvm_get_dec(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *OUT tl_ofmap_bf16); + +void cvm_get_dec_fractions(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *OUT buf, + cvk_tl_t *OUT tl_ofmap_bf16); + +int cvm_emit_abs(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *OUT tl_ofmap_bf16, + cvk_fmt_t fmt); + +int _cvm_lut_exp_mantissa(cvk_context_t *ctx, cvk_tl_t *IN tl_ifmap, cvk_tl_t *IN tl_buf, + cvk_tl_t *tbl_answer, 
cvk_tl_t *tbl_answer_mantissa, + cvk_tl_t *OUT tl_ofmap_bf16, bool is_dirty_ifmap); + +int _cvm_atan_fast_emit(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2, + cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf, + cvk_tl_t *tl_table_answer, cvk_tl_t *tl_table_answer_mantissa, + cvk_tl_t *OUT tl_ofmap_bf16, cvk_fmt_t fmt, float b, bool is_dirty_ifmap); + +int cvm_emit_x_over_y(cvk_context_t *ctx, cvk_tl_t *IN x, cvk_tl_t *IN y, cvk_tl_t *IN tl_buf, + cvk_tl_t *OUT tl_ofmap_bf16, cvk_tl_t *tl_table_answer, + cvk_tl_t *tl_table_answer_mantissa, cvk_fmt_t fmt, bool is_dirty_ifmap); + +int _cvm_emit_mask(cvk_context_t *ctx, cvk_tl_t *IN tl_ifmap, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2, + cvk_tl_t *tl_buf3, cvk_tl_t *tl_pos_neg_table, cvk_tl_t *tl_0_idx_table, + cvk_tl_t *OUT tl_ofmap_bf16, cvk_fmt_t fmt, enum CVM_MASK_TYPE mask, + bool is_dirty_ifmap); + +void _cvm_get_tbl_idx(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *OUT tl_ofmap_bf16, + cvk_fmt_t src_fmt, int int8_rnd_mode); +int __cvm_atan_fast_emit(cvk_context_t *ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_invert_buf, + cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_table_answer, + cvk_tl_t *tl_table_answer_mantissa, cvk_tl_t *OUT tl_ofmap_bf16, + cvk_fmt_t fmt); + +// not need to export to user +// mask please refer \CVM_MASK_TYPE for supported case +int cvm_emit_mask_gt0(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_pos_neg_buf, + cvk_tl_t *tl_0_idx_buf, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +int cvm_emit_mask_ge0(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_pos_neg_table, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +int cvm_emit_mask_le0(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_pos_neg_table, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +int cvm_emit_mask_eq0(cvk_context_t 
*cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_0_idx_table, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +int cvm_emit_mask_lt0(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_pos_neg_table, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +int _cvm_atan_emit(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2, + cvk_tl_t *tl_buf3, cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_slope_buf, + cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_table_answer, + cvk_tl_t *tl_table_answer_mantissa, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt, + float b); + +void cvm_emit_mask_ge0_lt0(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *index_i8, + cvk_tl_t *tl_buf3, cvk_fmt_t fmt); + +void cvm_emit_mask_eq_0(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *tl_buf, cvk_tl_t *index_i8, + cvk_tl_t *tl_buf3, cvk_fmt_t fmt); + +int cvm_lut_exp_mantissa(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tbl_answer, cvk_tl_t *tbl_answer_mantissa, + cvk_tl_t *tl_ofmap_bf16); + +int cvm_emit_pythagoras(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *x, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_sqrt_table_answer, + cvk_tl_t *tl_sqrt_table_answer_mantissa, cvk_tl_t *tl_ofmap_bf16, + cvk_fmt_t fmt); + +int cvm_emit_max_const(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ofmap_bf16, + cvk_fmt_t fmt, float b); + +int cvm_emit_min_const(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ofmap_bf16, + cvk_fmt_t fmt, float b); + +int cvm_emit_0_1_revert(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tbl_answer, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +int cvm_emit_mul(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2, + cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +int cvm_emit_add(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ifmap2, + cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +int cvm_emit_add_const(cvk_context_t *cvk_ctx, cvk_tl_t 
*tl_ifmap, cvk_tl_t *tl_ofmap_bf16, + cvk_fmt_t fmt, float b); + +int cvm_emit_mul_const(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ofmap_bf16, + cvk_fmt_t fmt, float b); +// not release yet + +void cvm_atan2_emit(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *x, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_buf4, cvk_tl_t *tl_buf5, + cvk_tl_t *tl_buf6, cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_slope_buf, + cvk_tl_t *tl_invert_buf, cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_table_answer, + cvk_tl_t *tl_table_answer_mantissa, cvk_tl_t *tl_sqrt_table_answer, + cvk_tl_t *tl_sqrt_table_answer_mantissa, cvk_tl_t *tl_0_idx_table, + cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +int cvm_atan_slope_multipilier(cvk_context_t *cvk_ctx, cvk_tl_t *tl_buf, cvk_tl_t *tl_buf2, + cvk_tl_t *tl_buf3, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_ofmap_bf16, + cvk_fmt_t fmt); + +int cvm_atan_fast_emit(cvk_context_t *cvk_ctx, cvk_tl_t *tl_ifmap, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_invert_buf, + cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_table_answer, + cvk_tl_t *tl_table_answer_mantissa, cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt, + bool is_dirty_ifmap); + +void cvm_atan2_fast_emit(cvk_context_t *cvk_ctx, cvk_tl_t *y, cvk_tl_t *x, cvk_tl_t *tl_buf, + cvk_tl_t *tl_buf2, cvk_tl_t *tl_buf3, cvk_tl_t *tl_buf4, + cvk_tl_t *tl_y0_buf, cvk_tl_t *tl_slope_buf, cvk_tl_t *tl_invert_buf, + cvk_tl_t *tl_pos_neg_buf, cvk_tl_t *tl_table_answer, + cvk_tl_t *tl_table_answer_mantissa, cvk_tl_t *tl_0_idx_table, + cvk_tl_t *tl_ofmap_bf16, cvk_fmt_t fmt); + +// conv used +int cvm_reshape_channel_same_pad( + cvk_context_t *cvk_ctx, int ic, int ih, int iw, int kh, int kw, int pad_right, int pad_left, + int stride_h, int stride_w, cvk_tl_shape_t *tl_load_shape, cvk_tl_stride_t *new_tl_ifmap_stride, + cvk_tg_shape_t *new_tg_ifmap_shape, cvk_tg_stride_t *new_tg_ifmap_stride, + cvk_tl_shape_t *new_tl_weight_shape, cvk_tl_shape_t *new_tl_bias_shape, + cvk_tl_shape_t 
*new_tl_ofmap_shape, cvk_fmt_t fmt, int eu_align); + +#endif /* GEN_LUT_1880v2_H */ diff --git a/cvimath/src/set_val_by_mask.c b/cvimath/src/set_val_by_mask.c new file mode 100644 index 000000000..33c976a79 --- /dev/null +++ b/cvimath/src/set_val_by_mask.c @@ -0,0 +1,1169 @@ +#include +#include "gen_lut.h" + +static inline int check_u8(cvk_tl_t* a, cvk_tl_t* b, cvk_tl_t* c) { + return (a->fmt == CVK_FMT_U8 && b->fmt == CVK_FMT_U8 && c->fmt == CVK_FMT_U8); +} + +static inline int check_same_fmt(cvk_tl_t* a, cvk_tl_t* b, cvk_tl_t* c) { + return a->fmt == b->fmt && b->fmt == c->fmt; +} + +int cvm_set_image_by_u8mask(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_mask, cvk_tl_t* tl_ofmap) { + int ret = 0; + cvk_fmt_t fmt = tl_ifmap->fmt; + if (!check_u8(tl_ifmap, tl_buf, tl_ofmap)) { + // throw config error + printf("input/buf/ofmap format should config CVK_FMT_U8\n"); + return -1; + } + + if (!check_same_fmt(tl_ifmap, tl_buf, tl_mask) && !check_same_fmt(tl_ifmap, tl_buf, tl_ofmap)) { + printf("all tensor's fmt shoud be equal\n"); + return -2; + } + + cvk_tl_t* high = tl_buf; + if (tl_ofmap->fmt == CVK_FMT_BF16) { + // TODO: support it + high = NULL; + } else if (tl_ofmap->fmt == CVK_FMT_U8) { + // hw limitation that input should be i8 + tl_ofmap->fmt = tl_buf->fmt = tl_mask->fmt = CVK_FMT_I8; + cvm_emit_mul_const(ctx, high, high, high->fmt, 0); + } else { + printf("not support fmt\n"); + return -3; + } + + // revert mask to set selected one as 0 + cvm_emit_mul_const(ctx, tl_mask, tl_mask, tl_mask->fmt, -1); + + // set mask selected one as 0 + // e.g: -1 - (-1) for this cast that turn to -1 * -1 + 255(0xff) = 256, get low part as 0 + cvk_tiu_mac_param_t p2; + p2.res_high = high; + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ofmap; + p2.b_is_const = 0; + p2.b = tl_mask; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + + ctx->ops->tiu_mac(ctx, &p2); + + // revert back 
+ cvm_emit_mul_const(ctx, tl_mask, tl_mask, tl_mask->fmt, -1); + + // overwrite selected one + p2.res_high = high; + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ifmap; + p2.b_is_const = 0; + p2.b = tl_mask; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + + ctx->ops->tiu_mac(ctx, &p2); + + // restore + tl_ifmap->fmt = tl_buf->fmt = tl_mask->fmt = tl_ofmap->fmt = fmt; + + return ret; +} + +// dp means depthwise version +int cvm_set_image_by_u8mask_dp(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_mask, + cvk_tl_t* tl_kernel, cvk_tl_t* tl_bias, cvk_tl_t* tl_ofmap) { + int ret = 0; + cvk_fmt_t fmt = tl_ifmap->fmt; + if (!check_u8(tl_ifmap, tl_ifmap, tl_ofmap)) { + // throw config error + printf("input/buf/ofmap format should config CVK_FMT_U8\n"); + return -1; + } + + if (!check_same_fmt(tl_ifmap, tl_ifmap, tl_mask) && + !check_same_fmt(tl_ifmap, tl_ifmap, tl_ofmap)) { + printf("all tensor's fmt shoud be equal\n"); + return -2; + } + + if (tl_ofmap->fmt == CVK_FMT_BF16) { + // TODO: support it + } else if (tl_ofmap->fmt == CVK_FMT_U8) { + // hw limitation that input should be i8 + tl_ifmap->fmt = tl_ofmap->fmt = tl_mask->fmt = CVK_FMT_I8; + } else { + printf("not support fmt\n"); + return -3; + } + + // mask 1 means overwrite new one, 0 means keep old + // if = if * mask + // mask = depthwise(mask) * kernel_1x1 + bias, kernel set to -1, bias set to 1 + // of = of * mask + // mask = mask * 0 // reset high part + // mask_of = of + 1 * if + + cvm_emit_mul(ctx, tl_ifmap, tl_mask, tl_ifmap, tl_mask->fmt); + + // revert 0/1 to 1/0 + cvk_tiu_depthwise_pt_convolution_param_t param; + param.ofmap = tl_mask; + param.ifmap = tl_mask; + param.weight = tl_kernel; + param.bias = tl_bias; + param.ins_h = 0; + param.ins_last_h = 0; + param.ins_w = 0; + param.ins_last_w = 0; + param.stride_h = 1; + param.stride_w = 1; + param.dilation_h = 1; + param.dilation_w = 1; + param.pad_top = 0; + 
param.pad_bottom = 0; + param.pad_left = 0; + param.pad_right = 0; + param.relu_enable = 0; + param.rshift_bits = 0; + param.ins_val = 0; // symmetric quantization + param.ins_fp = 0; // symmetric quantization + ctx->ops->tiu_pt_depthwise_convolution(ctx, ¶m); + + // keep of + cvm_emit_mul(ctx, tl_ofmap, tl_mask, tl_ofmap, tl_mask->fmt); + + // reset high part + cvm_emit_mul_const(ctx, tl_mask, tl_mask, tl_mask->fmt, 0); + + cvk_tiu_mac_param_t p2; + p2.res_high = tl_mask; + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ifmap; + p2.b_is_const = 1; + p2.b_const.val = 1; + p2.b_const.is_signed = 1; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + + ctx->ops->tiu_mac(ctx, &p2); + + // restore + tl_ifmap->fmt = tl_mask->fmt = tl_ofmap->fmt = fmt; + + return ret; +} + +// \is_less = 1 means that 1 indicate less and 0 is greater equal \threshold +// \is_less = 0 means that 1 indicate greater equal \threshold and 0 indicate less +static void __get_less_large_mask(cvk_context_t* ctx, cvk_tl_t* buf, cvk_tl_t* buf2, + cvk_tl_t* tl_update_tbl, uint8_t threshold, bool is_less) { + bool is_signed = buf->fmt == CVK_FMT_I8; + + // keep tl_update_tbl < threshold + // mul to hoist int16 and add it with sign bit + // TODO: try not use high part + cvk_tiu_mul_param_t p1 = {0}; + p1.res_high = buf2; + p1.res_low = buf; + p1.a = tl_update_tbl; + p1.b_const.val = 1; + p1.b_const.is_signed = is_signed; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + // just check sign bit for > thres or not + // i16 diff + cvk_fmt_t fmt = buf->fmt; + buf2->fmt = buf->fmt = CVK_FMT_I8; + is_signed = true; + + // e.g: 10 - 6 = 4, 2 - 6 = -4 + cvk_tiu_add_param_t p4; + p4.res_high = 0; // saturatue to int8 + p4.res_low = buf; + p4.a_high = buf2; + p4.a_low = buf; + p4.b_is_const = 1; + p4.b_const.val = -1 * (threshold); + p4.b_const.is_signed = is_signed; + 
p4.rshift_bits = 0; + p4.relu_enable = 0; + ctx->ops->tiu_add(ctx, &p4); + + // saturate to int max + // 4 * -127 = -128, -4 * -127 = 127 + // hex represent is 0x80, 0x7F + cvk_tiu_mul_param_t p; + p.res_high = 0; + p.res_low = buf; + p.a = buf; + p.b_is_const = 1; + p.b_const.val = -127; // revert to > 0 + if (!is_less) { + p.b_const.val = 127; + } + p.b_const.is_signed = is_signed; + p.rshift_bits = 0; + p.relu_enable = is_signed; + ctx->ops->tiu_mul(ctx, &p); + + // set as mask(127->1) + // hex represent is 0x80, 0x7F, right shift 7 + // 0x1, 0x0 + p.res_high = 0; + p.res_low = buf; + p.a = buf; + p.b_is_const = 1; + p.b_const.val = 1; + p.b_const.is_signed = is_signed; + p.rshift_bits = 7; + p.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p); + + // revert + buf2->fmt = buf->fmt = fmt; + return; +} + +// \is_less = 1 means that 1 indicate less and 0 is greater equal \threshold +// \is_less = 0 means that 1 indicate greater equal \threshold and 0 indicate less +static void __get_less_large_mask_dp(cvk_context_t* ctx, cvk_tl_t* tl_update_tbl, + cvk_tl_t* tl_kernel, cvk_tl_t* tl_threshold, bool is_less) { + bool is_signed = tl_update_tbl->fmt == CVK_FMT_I8; + + // 1. depthwise(tl_update_tbl) * kernel(1) + bias(-1 * threshold) + // 2. mul to saturate it + // 3. 
right shift + + cvk_tiu_depthwise_pt_convolution_param_t param; + param.ofmap = tl_update_tbl; + param.ifmap = tl_update_tbl; + param.weight = tl_kernel; + param.bias = tl_threshold; + param.ins_h = 0; + param.ins_last_h = 0; + param.ins_w = 0; + param.ins_last_w = 0; + param.stride_h = 1; + param.stride_w = 1; + param.dilation_h = 1; + param.dilation_w = 1; + param.pad_top = 0; + param.pad_bottom = 0; + param.pad_left = 0; + param.pad_right = 0; + param.relu_enable = 0; + param.rshift_bits = 0; + param.ins_val = 0; // symmetric quantization + param.ins_fp = 0; // symmetric quantization + ctx->ops->tiu_pt_depthwise_convolution(ctx, ¶m); + + cvk_fmt_t fmt = tl_update_tbl->fmt; + tl_update_tbl->fmt = CVK_FMT_I8; + is_signed = true; + + // saturate to int max + // 4 * -127 = -128, -4 * -127 = 127 + // hex represent is 0x80, 0x7F + cvk_tiu_mul_param_t p; + p.res_high = 0; + p.res_low = tl_update_tbl; + p.a = tl_update_tbl; + p.b_is_const = 1; + p.b_const.val = -127; // revert to > 0 + if (!is_less) { + p.b_const.val = 127; + } + p.b_const.is_signed = is_signed; + p.rshift_bits = 0; + p.relu_enable = is_signed; + ctx->ops->tiu_mul(ctx, &p); + + // set as mask(127->1) + // hex represent is 0x80, 0x7F, right shift 7 + // 0x1, 0x0 + p.res_high = 0; + p.res_low = tl_update_tbl; + p.a = tl_update_tbl; + p.b_is_const = 1; + p.b_const.val = 1; + p.b_const.is_signed = is_signed; + p.rshift_bits = 7; + p.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p); + + // revert + tl_update_tbl->fmt = fmt; + return; +} +/** + * \high as output + */ +static void _get_less_large_mask(cvk_context_t* ctx, cvk_tl_t* buf, cvk_tl_t* buf2, + cvk_tl_t* tl_update_tbl, uint8_t threshold, bool is_less) { + bool is_signed = buf->fmt == CVK_FMT_I8; + // keep tl_update_tbl < threshold + // mul to hoist int16 and add it with sign bit + // TODO: try not use high part + cvk_tiu_mul_param_t p1 = {0}; + p1.res_high = buf2; + p1.res_low = buf; + p1.a = tl_update_tbl; + p1.b_const.val = 1; + p1.b_const.is_signed = 
is_signed; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + // just check sign bit for > thres or not + // i16 diff + cvk_fmt_t fmt = buf->fmt; + buf2->fmt = buf->fmt = CVK_FMT_I8; + is_signed = true; + + cvk_tiu_add_param_t p4; + p4.res_high = 0; // saturatue to int8 + p4.res_low = buf; + p4.a_high = buf2; + p4.a_low = buf; + p4.b_is_const = 1; + p4.b_const.val = -1 * (threshold); + p4.b_const.is_signed = is_signed; + p4.rshift_bits = 0; + p4.relu_enable = 0; + ctx->ops->tiu_add(ctx, &p4); + + // saturate to int max + cvk_tiu_mul_param_t p; + p.res_high = 0; + p.res_low = buf; + p.a = buf; + p.b_is_const = 1; + p.b_const.val = -127; // revert to > 0 + if (!is_less) { + p.b_const.val = 127; + } + p.b_const.is_signed = is_signed; + p.rshift_bits = 0; + p.relu_enable = is_signed; + ctx->ops->tiu_mul(ctx, &p); + + // set as mask(127->1) + p.res_high = 0; + p.res_low = buf; + p.a = buf; + p.b_is_const = 1; + p.b_const.val = 1; + p.b_const.is_signed = is_signed; + p.rshift_bits = 7; + p.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p); + + // revert + buf2->fmt = buf->fmt = fmt; + return; +} + +static void _get_less_mask(cvk_context_t* ctx, cvk_tl_t* buf, cvk_tl_t* buf2, + cvk_tl_t* tl_update_tbl, uint8_t threshold) { + _get_less_large_mask(ctx, buf, buf2, tl_update_tbl, threshold, /*is_less=*/1); +} + +static void _get_large_mask(cvk_context_t* ctx, cvk_tl_t* buf, cvk_tl_t* buf2, + cvk_tl_t* tl_update_tbl, uint8_t threshold) { + _get_less_large_mask(ctx, buf, buf2, tl_update_tbl, threshold, /*is_less=*/0); +} + +int cvm_set_image_by_two_info_i8(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf2, + cvk_tl_t* tl_mask, cvk_tl_t* tl_update_tbl, uint8_t threshold, + cvk_tl_t* tl_ofmap) { + cvk_fmt_t fmt = tl_ifmap->fmt; + if (!check_u8(tl_ifmap, tl_buf2, tl_update_tbl)) { + // throw config error + printf("input/buf/tl_update_tbl format should config CVK_FMT_U8\n"); + return -1; + } + + if 
(!check_same_fmt(tl_ifmap, tl_buf2, tl_mask)) { + printf("all tensor's fmt shoud be equal\n"); + return -2; + } + + if (tl_ofmap->fmt == CVK_FMT_BF16) { + } else if (tl_ofmap->fmt == CVK_FMT_U8) { + // hw limitation that input should be i8 + // tl_update_tbl->fmt = tl_buf->fmt = tl_mask->fmt = tl_ofmap->fmt = CVK_FMT_I8; + tl_buf2->fmt = tl_update_tbl->fmt = tl_ofmap->fmt = tl_mask->fmt = CVK_FMT_I8; + } else { + printf("not support fmt\n"); + return -3; + } + + __get_less_large_mask(ctx, tl_update_tbl, tl_buf2, tl_update_tbl, threshold, 1); + + // set new mask + cvm_emit_mul(ctx, tl_mask, tl_update_tbl, tl_mask, tl_mask->fmt); + + // restore + tl_buf2->fmt = tl_update_tbl->fmt = tl_mask->fmt = tl_ofmap->fmt = fmt; + + return cvm_set_image_by_u8mask(ctx, tl_ifmap, tl_buf2, tl_mask, tl_ofmap); +} + +int cvm_set_image_by_two_info_i8_dp(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_kernel, + cvk_tl_t* tl_mask, cvk_tl_t* tl_update_tbl, + cvk_tl_t* tl_threshold, cvk_tl_t* tl_ofmap) { + cvk_fmt_t fmt = tl_ifmap->fmt; + if (!check_u8(tl_ifmap, tl_ofmap, tl_update_tbl)) { + // throw config error + printf("input/buf/tl_update_tbl format should config CVK_FMT_U8\n"); + return -1; + } + + if (!check_same_fmt(tl_ifmap, tl_ofmap, tl_mask)) { + printf("all tensor's fmt shoud be equal\n"); + return -2; + } + + if (tl_update_tbl->shape.h <= 1) { + printf("tl_update_tbl will be as bias high part, the high should be >= 2\n"); + return -3; + } + + if (tl_ofmap->fmt == CVK_FMT_BF16) { + } else if (tl_ofmap->fmt == CVK_FMT_U8) { + // hw limitation that input should be i8 + // tl_update_tbl->fmt = tl_buf->fmt = tl_mask->fmt = tl_ofmap->fmt = CVK_FMT_I8; + tl_update_tbl->fmt = tl_mask->fmt = CVK_FMT_I8; + } else { + printf("not support fmt\n"); + return -3; + } + + __get_less_large_mask_dp(ctx, tl_update_tbl, tl_kernel, tl_threshold, 1); + + // set new mask + cvm_emit_mul(ctx, tl_mask, tl_update_tbl, tl_mask, tl_mask->fmt); + + // dirty bias set to 1 + // tl_bias = tl_bias * 0 + 
// tl_update_tbl = tl_update_tbl * 0 + // tl_update-tbl_tl_bias = tl_update_tbl-tl_bias + 1, reshape tl_update_tbl, set to 1 + // tbl_tl_bias = tl_update copy high part to tbl_tl_bias high part, stride w = 2 + cvm_emit_mul_const(ctx, tl_threshold, tl_threshold, tl_threshold->fmt, 0); + cvm_emit_mul_const(ctx, tl_update_tbl, tl_update_tbl, tl_update_tbl->fmt, 0); + + cvk_tl_stride_t tl_update_tbl_st = tl_update_tbl->stride; + tl_update_tbl->stride = tl_threshold->stride; + tl_update_tbl->shape = tl_threshold->shape; + + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_threshold; + p4.a_high = tl_update_tbl; + p4.a_low = tl_threshold; + p4.b_is_const = 1; + p4.b_const.val = 1; + p4.b_const.is_signed = 1; + p4.rshift_bits = 0; + p4.relu_enable = 0; + ctx->ops->tiu_add(ctx, &p4); + + // clean high part + tl_threshold->start_address++; // continuous low/high + tl_update_tbl->shape.n = 1; + cvm_emit_mul_const(ctx, tl_threshold, tl_threshold, tl_threshold->fmt, 0); + tl_threshold->start_address--; // restore + + // restore + tl_update_tbl->fmt = tl_mask->fmt = fmt; + tl_update_tbl->stride = tl_update_tbl_st; + tl_update_tbl->shape = tl_mask->shape; + + // set to -1 for \cvm_set_image_by_u8mask_dp + cvm_emit_mul_const(ctx, tl_kernel, tl_kernel, tl_kernel->fmt, -1); + + return cvm_set_image_by_u8mask_dp(ctx, tl_ifmap, tl_mask, tl_kernel, tl_threshold, tl_ofmap); +} + +int cvm_gen_image_diff(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_ifmap2, + cvk_tl_t* tl_buf, cvk_tl_t* tl_buf2, cvk_tl_t* tl_ofmap) { + int ret = 0; + cvk_fmt_t fmt = tl_ifmap->fmt; + if (!check_u8(tl_ifmap, tl_ifmap2, tl_ofmap)) { + // throw config error + printf("input/buf/ofmap format should config CVK_FMT_U8\n"); + return -1; + } + + if (!check_same_fmt(tl_ifmap, tl_buf, tl_buf2)) { + printf("all tensor's fmt shoud be equal\n"); + return -2; + } + + if (tl_ofmap->fmt == CVK_FMT_BF16) { + // TODO: support it + } else if (tl_ofmap->fmt == CVK_FMT_U8) { + } else { + printf("not support 
fmt\n"); + return -3; + } + + // get large one + cvk_tiu_max_param_t p13 = {0}; + p13.max = tl_buf; + p13.a = tl_ifmap; + p13.b_is_const = 0; + p13.b = tl_ifmap2; + p13.layer_id = 0; + ctx->ops->tiu_max(ctx, &p13); + + // compare to get a > b or a < b, 1 means a > b + // cvk_tiu_sub_param_t p5; + // p5.res_high = 0; // saturatue to int8 + // p5.res_low = tl_ofmap; + // p5.a_high= tl_buf2; + // p5.a_low = tl_buf; + // p5.b_high = tl_buf2; + // p5.b_low = tl_ifmap2; + // p5.rshift_bits = 0; + // ctx->ops->tiu_sub(ctx, &p5); + tl_ifmap2->fmt = tl_buf->fmt = tl_buf2->fmt = CVK_FMT_I8; + cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, tl_buf2->fmt, 0); + cvk_tiu_mac_param_t p2; + p2.res_high = tl_buf2; + p2.res_low = tl_buf; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ifmap2; + p2.b_is_const = 1; + p2.b_const.val = -1; + p2.b_const.is_signed = 1; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + + // mul 255 and rightshift to get 0/1, 1 means tl_ifmap > tl_ifmap2 + // get positive + tl_buf->fmt = CVK_FMT_U8; + cvk_tiu_mul_param_t p1 = {0}; + p1.res_high = 0; + p1.res_low = tl_buf; + p1.a = tl_buf; + p1.b_const.val = 255; + p1.b_const.is_signed = 0; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + // that max = 127 + tl_buf->fmt = CVK_FMT_I8; + p1.res_high = 0; + p1.res_low = tl_buf; + p1.a = tl_buf; + p1.b_const.val = -127; + p1.b_const.is_signed = 1; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + // 127 >> 7 to 0/1 + p1.res_high = 0; + p1.res_low = tl_buf; + p1.a = tl_buf; + p1.b_const.val = 1; + p1.b_const.is_signed = 1; + p1.b_is_const = true; + p1.rshift_bits = 7; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + tl_ifmap->fmt = tl_ofmap->fmt = tl_buf->fmt = CVK_FMT_I8; + // keep a that a > b + cvm_emit_mul(ctx, 
tl_buf, tl_ifmap, tl_ofmap, tl_ofmap->fmt); + + // mul -1 for get - b under a > b + cvm_emit_mul_const(ctx, tl_buf, tl_buf, tl_buf->fmt, -1); + + // get a - b = a + (-1) * b + // cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, tl_buf2->fmt, 0); + p2.res_high = tl_buf2; // dont care add garbage + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ifmap2; + p2.b_is_const = 0; + p2.b = tl_buf; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + cvm_emit_mul_const(ctx, tl_ofmap, tl_ofmap, tl_ofmap->fmt, 1); + + // hoist to int16 + tl_buf2->fmt = CVK_FMT_I8; + p1.res_high = tl_buf2; + p1.res_low = tl_buf; + p1.a = tl_buf; + p1.b_const.val = 1; + p1.b_const.is_signed = 1; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + // get revert 0/-1 to 1/0, get a < b case + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_buf; + p4.a_high = tl_buf2; + p4.a_low = tl_buf; + p4.b_is_const = 1; + p4.b_const.val = 1; + p4.b_const.is_signed = 1; + p4.rshift_bits = 0; + p4.relu_enable = 0; + ctx->ops->tiu_add(ctx, &p4); + + // remove a < b in b + cvm_emit_mul(ctx, tl_buf, tl_ifmap2, tl_ifmap2, tl_ifmap2->fmt); + + // mul -1 for -a + cvm_emit_mul_const(ctx, tl_buf, tl_buf, tl_buf->fmt, -1); + + // aops->tiu_mac(ctx, &p2); + + // output is u8, a > b part merge a < b + p2.res_high = tl_buf2; // dont care add garbage + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ifmap2; + p2.b_is_const = 1; + p2.b_const.val = 1; + p2.b_const.is_signed = 0; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + + // restore + tl_buf->fmt = tl_buf2->fmt = tl_ifmap->fmt = tl_ifmap2->fmt = tl_ofmap->fmt = fmt; + + return ret; +} + +int cvm_gen_image_diff_dp(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_ifmap2, + cvk_tl_t* 
tl_buf, cvk_tl_t* tl_buf2, cvk_tl_t* tl_ofmap) { + int ret = 0; + cvk_fmt_t fmt = tl_ifmap->fmt; + if (!check_u8(tl_ifmap, tl_ifmap2, tl_ofmap)) { + // throw config error + printf("input/buf/ofmap format should config CVK_FMT_U8\n"); + return -1; + } + + if (!check_same_fmt(tl_ifmap, tl_buf, tl_buf2)) { + printf("all tensor's fmt shoud be equal\n"); + return -2; + } + + if (tl_ofmap->fmt == CVK_FMT_BF16) { + // TODO: support it + } else if (tl_ofmap->fmt == CVK_FMT_U8) { + } else { + printf("not support fmt\n"); + return -3; + } + + // tl_buf = max(\tl_ifmap, \tl_ifmap2) + // tl_buf2-tl_buf = tl_buf2-tl_buf + (- 1 * tl_ifmap2), == 0 means \tl_ifmap < \tl_ifmap2, + // otherwise \tl_ifmap > \tl_ifmap2 tl_buf = tl_buf * 255 to get 0/1, 1 means \tl_ifmap > + // \tl_ifmap3 get large one + cvk_tiu_max_param_t p13 = {0}; + p13.max = tl_buf; + p13.a = tl_ifmap; + p13.b_is_const = 0; + p13.b = tl_ifmap2; + p13.layer_id = 0; + ctx->ops->tiu_max(ctx, &p13); + + // compare to get a > b or a < b, 1 means a > b + // cvk_tiu_sub_param_t p5; + // p5.res_high = 0; // saturatue to int8 + // p5.res_low = tl_ofmap; + // p5.a_high= tl_buf2; + // p5.a_low = tl_buf; + // p5.b_high = tl_buf2; + // p5.b_low = tl_ifmap2; + // p5.rshift_bits = 0; + // ctx->ops->tiu_sub(ctx, &p5); + tl_ifmap2->fmt = tl_buf->fmt = tl_buf2->fmt = CVK_FMT_I8; + cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, tl_buf2->fmt, 0); + cvk_tiu_mac_param_t p2; + p2.res_high = tl_buf2; + p2.res_low = tl_buf; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ifmap2; + p2.b_is_const = 1; + p2.b_const.val = -1; + p2.b_const.is_signed = 1; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + + // mul 255 and rightshift to get 0/1, 1 means tl_ifmap > tl_ifmap2 + // get positive + tl_buf->fmt = CVK_FMT_U8; + cvk_tiu_mul_param_t p1 = {0}; + p1.res_high = 0; + p1.res_low = tl_buf; + p1.a = tl_buf; + p1.b_const.val = 255; + p1.b_const.is_signed = 0; + 
p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + // that max = 127 + tl_buf->fmt = CVK_FMT_I8; + p1.res_high = 0; + p1.res_low = tl_buf; + p1.a = tl_buf; + p1.b_const.val = -127; + p1.b_const.is_signed = 1; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + // 127 >> 7 to 0/1 + p1.res_high = 0; + p1.res_low = tl_buf; + p1.a = tl_buf; + p1.b_const.val = 1; + p1.b_const.is_signed = 1; + p1.b_is_const = true; + p1.rshift_bits = 7; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + tl_ifmap->fmt = tl_ofmap->fmt = tl_buf->fmt = CVK_FMT_I8; + // keep a that a > b + cvm_emit_mul(ctx, tl_buf, tl_ifmap, tl_ofmap, tl_ofmap->fmt); + + // mul -1 for get - b under a > b + cvm_emit_mul_const(ctx, tl_buf, tl_buf, tl_buf->fmt, -1); + + // get a - b = a + (-1) * b + // cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, tl_buf2->fmt, 0); + p2.res_high = tl_buf2; // dont care add garbage + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ifmap2; + p2.b_is_const = 0; + p2.b = tl_buf; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + cvm_emit_mul_const(ctx, tl_ofmap, tl_ofmap, tl_ofmap->fmt, 1); + + // hoist to int16 + tl_buf2->fmt = CVK_FMT_I8; + p1.res_high = tl_buf2; + p1.res_low = tl_buf; + p1.a = tl_buf; + p1.b_const.val = 1; + p1.b_const.is_signed = 1; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + // get revert 0/-1 to 1/0, get a < b case + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_buf; + p4.a_high = tl_buf2; + p4.a_low = tl_buf; + p4.b_is_const = 1; + p4.b_const.val = 1; + p4.b_const.is_signed = 1; + p4.rshift_bits = 0; + p4.relu_enable = 0; + ctx->ops->tiu_add(ctx, &p4); + + // remove a < b in b + cvm_emit_mul(ctx, tl_buf, 
tl_ifmap2, tl_ifmap2, tl_ifmap2->fmt); + + // mul -1 for -a + cvm_emit_mul_const(ctx, tl_buf, tl_buf, tl_buf->fmt, -1); + + // aops->tiu_mac(ctx, &p2); + + // output is u8, a > b part merge a < b + p2.res_high = tl_buf2; // dont care add garbage + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ifmap2; + p2.b_is_const = 1; + p2.b_const.val = 1; + p2.b_const.is_signed = 0; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + + // restore + tl_buf->fmt = tl_buf2->fmt = tl_ifmap->fmt = tl_ifmap2->fmt = tl_ofmap->fmt = fmt; + + return ret; +} +int cvm_update_tbl_by_threshold(cvk_context_t* ctx, cvk_tl_t* tl_mask, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_update_tbl, + uint8_t threshold_a, uint8_t threshold_b, cvk_tl_t* tl_ofmap) { + int ret = 0; + (void)threshold_b; + (void)tl_mask; + (void)tl_buf2; + (void)tl_buf3; + (void)tl_update_tbl; + cvk_fmt_t fmt = tl_ofmap->fmt; + if (!check_u8(tl_ofmap, tl_buf, tl_buf)) { + // throw config error + printf("ofmap/buf format should config CVK_FMT_U8\n"); + return -1; + } + + if (!check_same_fmt(tl_ofmap, tl_buf, tl_buf)) { + printf("all tensor's fmt shoud be equal\n"); + return -2; + } + + cvk_tl_t* high = tl_buf; + if (tl_ofmap->fmt == CVK_FMT_BF16) { + high = NULL; + } else if (tl_ofmap->fmt == CVK_FMT_U8) { + // hw limitation that input should be i8 + // cvm_emit_mul_const(ctx, high, high, high->fmt, 0); + } else { + printf("not support fmt\n"); + return -3; + } + + // mask = get_less_u8(diff[i], thresh_a) + _get_less_mask(ctx, high, tl_buf2, tl_update_tbl, threshold_a); + + // mask_1 = get_less_i8(update_tbl[i], thresh_b), 0/1 + tl_buf2->fmt = tl_ofmap->fmt = tl_buf3->fmt = CVK_FMT_I8; + _get_less_mask(ctx, tl_buf2, tl_buf3, tl_ofmap, threshold_b); + + // mask_2 = mask * mask_1 // keep for next triple if-else + // tl_update_tbl as buf + tl_update_tbl->fmt = tl_buf->fmt = CVK_FMT_I8; + 
cvm_emit_mul(ctx, tl_buf, tl_buf2, tl_update_tbl, tl_buf2->fmt); + + cvm_emit_mul_const(ctx, tl_update_tbl, tl_update_tbl, tl_buf2->fmt, -1); + cvm_emit_mul_const(ctx, tl_buf3, tl_buf3, tl_buf3->fmt, 0); // diff 0 used + // update_tbl[i] = update_tbl[i] - mask_2 * update_tbl[i], set 0 + // sub itself leverage int16 is ok, plz refer \cvm_set_image_by_u8mask + cvk_tiu_mac_param_t p2; + p2.res_high = tl_buf3; // diff itsef MUST set high part as 0 + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_update_tbl; + p2.b_is_const = 0; + p2.b = tl_ofmap; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + + // mask_1 = (mask_1 - 1), hoist it + cvk_tiu_mul_param_t p1 = {0}; + p1.res_high = tl_buf3; + p1.res_low = tl_buf2; + p1.a = tl_buf2; + p1.b_const.val = 1; + p1.b_const.is_signed = 1; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_buf2; + p4.a_high = tl_buf3; + p4.a_low = tl_buf2; + p4.b_is_const = 1; + p4.b.high = 0; + p4.b_const.val = -1; + p4.b_const.is_signed = 1; + p4.rshift_bits = 0; + p4.relu_enable = 0; + ctx->ops->tiu_add(ctx, &p4); + + // mask_2 = mask * mask_1 + cvm_emit_mul(ctx, tl_buf, tl_buf2, tl_update_tbl, tl_buf2->fmt); + + // update_tbl[i] = update_tbl[i] + mask_2 // (update_tbl[i]-1) + // int8, hoist it + p1.res_high = tl_buf3; + p1.res_low = tl_ofmap; + p1.a = tl_ofmap; + p1.b_const.val = 1; + p1.b_const.is_signed = 1; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + p2.res_high = tl_buf3; + p2.res_low = tl_ofmap; + p2.res_is_int8 = 1; // keep origin + p2.a = tl_update_tbl; + p2.b_is_const = 1; + p2.b_const.val = 1; + p2.b_const.is_signed = 1; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + 
ctx->ops->tiu_mac(ctx, &p2); + + // mask = (mask - 1) * -1 // export, rever 0/1 to 1/0 + cvm_emit_mul_const(ctx, tl_buf3, tl_buf3, tl_buf3->fmt, 0); // diff 0 used + p4.res_high = 0; + p4.res_low = tl_buf; + p4.a_high = tl_buf3; + p4.a_low = tl_buf; + p4.b_is_const = 1; + p4.b.high = 0; + p4.b_const.val = -1; + p4.b_const.is_signed = 1; + p4.rshift_bits = 0; + p4.relu_enable = 0; + ctx->ops->tiu_add(ctx, &p4); + + cvm_emit_mul_const(ctx, tl_buf, tl_buf, tl_buf->fmt, -1); + + // update_tbl[i] = update_tbl[i] + mask // update_tbl[i]++, return + // int8, hoist it + p1.res_high = tl_buf3; + p1.res_low = tl_ofmap; + p1.a = tl_ofmap; + p1.b_const.val = 1; + p1.b_const.is_signed = 1; + p1.b_is_const = true; + p1.rshift_bits = 0; + p1.layer_id = 0; + p1.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p1); + + p2.res_high = tl_buf3; + p2.res_low = tl_ofmap; + p2.res_is_int8 = 1; // keep origin + p2.a = tl_buf; + p2.b_is_const = 1; + p2.b_const.val = 1; + p2.b_const.is_signed = 1; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + + // restore + tl_buf2->fmt = tl_buf3->fmt = tl_ofmap->fmt = tl_update_tbl->fmt = tl_buf->fmt = fmt; + + return ret; +} + +int cvm_set_image_by_two_info_u8(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_update_tbl, uint8_t threshold, + cvk_tl_t* tl_ofmap) { + cvk_fmt_t fmt = tl_ifmap->fmt; + if (!check_u8(tl_ifmap, tl_buf, tl_update_tbl)) { + // throw config error + printf("input/buf/tl_update_tbl format should config CVK_FMT_U8\n"); + return -1; + } + + if (!check_same_fmt(tl_ifmap, tl_buf, tl_buf2)) { + printf("all tensor's fmt shoud be equal\n"); + return -2; + } + + cvk_tl_t* high = tl_buf; + if (tl_ofmap->fmt == CVK_FMT_BF16) { + high = NULL; + } else if (tl_ofmap->fmt == CVK_FMT_U8) { + // hw limitation that input should be i8 + // tl_buf2->fmt = tl_update_tbl->fmt = tl_ofmap->fmt = tl_buf->fmt = CVK_FMT_I8; + } else { + 
printf("not support fmt\n"); + return -3; + } + + // large equal, u8 compare + _get_large_mask(ctx, high, tl_buf2, tl_update_tbl, threshold - 1); + // return 0; + + // restore + tl_buf2->fmt = tl_update_tbl->fmt = tl_buf->fmt = tl_ofmap->fmt = fmt; + + return cvm_set_image_by_u8mask(ctx, tl_ifmap, tl_buf2, high, tl_ofmap); +} + +int cvm_blend_image_by_tbl(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_update_tbl, uint8_t threshold, + uint8_t w1, uint8_t w2, cvk_tl_t* tl_ofmap) { + int ret = 0; + cvk_fmt_t fmt = tl_ifmap->fmt; + if (!check_u8(tl_ifmap, tl_buf, tl_ofmap) || !check_u8(tl_buf2, tl_update_tbl, tl_update_tbl)) { + // throw config error + printf("input/input1/input2/tl_update_tbl/buf/ofmap format should config CVK_FMT_U8\n"); + return -1; + } + + if (!check_same_fmt(tl_ifmap, tl_buf, tl_update_tbl) && + !check_same_fmt(tl_buf2, tl_update_tbl, tl_ofmap)) { + printf("all tensor's fmt shoud be equal\n"); + return -2; + } + + cvk_tl_t* high = tl_buf; + if (tl_ofmap->fmt == CVK_FMT_BF16) { + // TODO: support it + high = NULL; + } else if (tl_ofmap->fmt == CVK_FMT_U8) { + // hw limitation that input should be i8 + tl_buf2->fmt = tl_buf->fmt = tl_update_tbl->fmt = CVK_FMT_I8; + cvm_emit_mul_const(ctx, high, high, high->fmt, 0); + } else { + printf("not support fmt\n"); + return -3; + } + + // get g_update_tbl[i]>threshold + _get_large_mask(ctx, high, tl_buf2, tl_update_tbl, threshold); + + // dirty tl_update_tbl + // TODO: not copy again + cvm_emit_mul_const(ctx, high, tl_update_tbl, tl_buf->fmt, 1); + + tl_buf2->fmt = tl_buf->fmt = tl_ofmap->fmt = CVK_FMT_U8; + // ofmap * w1, keep high part + cvk_tiu_mul_param_t p; + p.res_high = tl_buf2; + p.res_low = tl_buf; + p.a = tl_ofmap; + p.b_is_const = 1; + p.b_const.val = w1; + p.b_const.is_signed = 0; + p.rshift_bits = 0; + p.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p); + + // buf2 = buf2 + w2*pY[i], i16 output, it should be >= 0 + cvk_tiu_mac_param_t p2; + p2.res_high = 
tl_buf2; + p2.res_low = tl_buf; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ifmap; + p2.b_is_const = 1; + p2.b_const.val = w2; + p2.b_const.is_signed = 0; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + + // keep update_tbl[i]>threshold mask, it ok for signed that just keep data + tl_buf2->fmt = tl_update_tbl->fmt = CVK_FMT_I8; + cvm_emit_mul(ctx, tl_buf2, tl_update_tbl, tl_buf2, tl_update_tbl->fmt); + + // mul -1 for sub it, dirty tl_update_tbl + cvm_emit_mul_const(ctx, tl_update_tbl, tl_update_tbl, tl_update_tbl->fmt, -1); + + high->fmt = tl_ofmap->fmt = CVK_FMT_I8; + // NOTICE: we only keep low part as U8 + // set update_tbl[i]>threshold as 0 + // sub itself leverage int16 is ok, plz refer \cvm_set_image_by_u8mask + p2.res_high = high; + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_ofmap; + p2.b_is_const = 0; + p2.b = tl_update_tbl; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + + // ouput as U8, get high part + tl_buf->fmt = tl_ofmap->fmt = tl_buf2->fmt = CVK_FMT_U8; + p2.res_high = high; // dont care + p2.res_low = tl_ofmap; + p2.res_is_int8 = 0; // keep origin + p2.a = tl_buf2; + p2.b_is_const = 1; + p2.b_const.val = 1; + p2.b_const.is_signed = 0; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); + + // restore + tl_buf2->fmt = tl_ifmap->fmt = tl_buf->fmt = tl_update_tbl->fmt = tl_ofmap->fmt = fmt; + + return ret; +} diff --git a/cvimath/src/tiu_lut_atan.c b/cvimath/src/tiu_lut_atan.c new file mode 100644 index 000000000..5746284fc --- /dev/null +++ b/cvimath/src/tiu_lut_atan.c @@ -0,0 +1,1106 @@ +/** + * plz refer [git](https://github.com/xiezhq-hermann/atan_lookup) + * input range is `all real numbers` and output range is -pi/2 < x < pi/2, + * you can refer 
[here](https://www.mathopenref.com/arctan.html) for more details + */ +// +// xiezhq@shanghaitech.edu.cn && wanghe@shanghaitech.edu.cn +/* Reference: + [1] Abhisek Ukil, Vishal H Shah, Bernhard Deck, + "Fast Computation of arctangent Functions for Embedded Applications: A + Comparative Analysis" IEEE International Symposium on Industrial Electronics, +Pages: 1206 - 1211, DOI: 10.1109/ISIE.2011.5984330, 2011 +[2] Sreeraman Rajan, Sichun Wang, Robert Inkol, and Alain Joyal +"Efficient Approximations for the Arctangent Function" +IEEE SIGNAL PROCESSING MAGAZINE [108] MAY 2006 +*/ + +#include +#include "gen_lut.h" // NOLINT + +//#define DBG + +static double LUT_d[102] = {0, + 0.00999966668666524, + 0.0199973339731505, + 0.0299910048568779, + 0.0399786871232900, + 0.0499583957219428, + 0.0599281551212079, + 0.0698860016346425, + 0.0798299857122373, + 0.0897581741899505, + 0.0996686524911620, + 0.109559526773944, + 0.119428926018338, + 0.129275004048143, + 0.139095941482071, + 0.148889947609497, + 0.158655262186401, + 0.168390157147530, + 0.178092938231198, + 0.187761946513593, + 0.197395559849881, + 0.206992194219821, + 0.216550304976089, + 0.226068387993884, + 0.235544980720863, + 0.244978663126864, + 0.254368058553266, + 0.263711834462266, + 0.273008703086711, + 0.282257421981491, + 0.291456794477867, + 0.300605670042395, + 0.309702944542456, + 0.318747560420644, + 0.327738506780556, + 0.336674819386727, + 0.345555580581712, + 0.354379919123438, + 0.363147009946176, + 0.371856073848581, + 0.380506377112365, + 0.389097231055278, + 0.397627991522129, + 0.406098058317616, + 0.414506874584786, + 0.422853926132941, + 0.431138740718782, + 0.439360887284591, + 0.447519975157170, + 0.455615653211225, + 0.463647609000806, + 0.471615567862328, + 0.479519291992596, + 0.487358579505190, + 0.495133263468404, + 0.502843210927861, + 0.510488321916776, + 0.518068528456721, + 0.525583793551610, + 0.533034110177490, + 0.540419500270584, + 0.547740013715902, + 0.554995727338587, + 
0.562186743900029, + 0.569313191100662, + 0.576375220591184, + 0.583373006993856, + 0.590306746935372, + 0.597176658092678, + 0.603982978252998, + 0.610725964389209, + 0.617405891751573, + 0.624023052976757, + 0.630577757214935, + 0.637070329275684, + 0.643501108793284, + 0.649870449411948, + 0.656178717991395, + 0.662426293833151, + 0.668613567927821, + 0.674740942223553, + 0.680808828915828, + 0.686817649758645, + 0.692767835397122, + 0.698659824721463, + 0.704494064242218, + 0.710271007486686, + 0.715991114416300, + 0.721654850864761, + 0.727262687996690, + 0.732815101786507, + 0.738312572517228, + 0.743755584298860, + 0.749144624606017, + 0.754480183834406, + 0.759762754875771, + 0.764992832710910, + 0.770170914020331, + 0.775297496812126, + 0.780373080066636, + 0.785398163397448, + 0.790373246728302}; + +void cvm_atan_y0(uint16_t* table_data_y0, cvk_tl_shape_t* table_shape) { + ASSERT(is_1880v2_tbl_shape(table_shape)); + + int table_hw = cvm_table_hw(); + + /** + * index 0 1 2 3 60 61 62 63 64 65 123 124 125 126 + *-------- + * exp (2) x -62 -61 -60 ... -3 -2 -1 0 1 2 .... 60 61 62 63 + * + * index 128 129 130 131 188 189 190 191 192 193 251 252 253 254 255 + *-------- + * exp (-2)x -62 -61 -60 ... -3 -2 -1 0 1 2 ... 
60 61 62 63 x + * + */ + + // [0 102) for > 1 + int lut_sz = sizeof(LUT_d) / sizeof(LUT_d[0]); + for (int i = 0; i < lut_sz; i++) { + table_data_y0[i] = convert_fp32_bf16(M_PI_2 - LUT_d[i]); + } + + // [102 204) for [0 1] + for (int i = lut_sz; i < lut_sz * 2; i++) { + table_data_y0[i] = convert_fp32_bf16(LUT_d[i - lut_sz]); + } + +#ifdef DBG + for (int i = 0; i < lut_sz * 2; i++) { + printf("y0[%d] is %f(0x%x)\n", i, convert_bf16_fp32(table_data_y0[i]), table_data_y0[i]); + } +#endif + // duplicate channel #1 to #31 + // TODO: tensor copy + for (uint32_t i = 1; i < table_shape->c; i++) { + memcpy(&table_data_y0[i * table_hw], &table_data_y0[0], sizeof(uint16_t) * table_hw); + } +} + +void cvm_atan_fast_degree_y0(uint16_t* table_data_y0, cvk_tl_shape_t* table_shape) { + ASSERT(is_1880v2_tbl_shape(table_shape)); + + int table_hw = cvm_table_hw(); + + /** + * index 0 1 2 3 60 61 62 63 64 65 123 124 125 126 + *-------- + * exp (2) x -62 -61 -60 ... -3 -2 -1 0 1 2 .... 60 61 62 63 + * + * index 128 129 130 131 188 189 190 191 192 193 251 252 253 254 255 + *-------- + * exp (-2)x -62 -61 -60 ... -3 -2 -1 0 1 2 ... 
60 61 62 63 x + * + */ + + // [0 102) for > 1 + int lut_sz = sizeof(LUT_d) / sizeof(LUT_d[0]); + for (int i = 0; i < lut_sz; i++) { + table_data_y0[i] = convert_fp32_bf16((M_PI_2 - LUT_d[i]) * 180 / M_PI); + } + + // [102 204) for [0 1] + for (int i = lut_sz; i < lut_sz * 2; i++) { + table_data_y0[i] = convert_fp32_bf16(LUT_d[i - lut_sz] * 180 / M_PI); + } + +#ifdef DBG + for (int i = 0; i < lut_sz * 2; i++) { + printf("y0[%d] is %f(0x%x)\n", i, convert_bf16_fp32(table_data_y0[i]), table_data_y0[i]); + } +#endif + // duplicate channel #1 to #31 + // TODO: tensor copy + for (uint32_t i = 1; i < table_shape->c; i++) { + memcpy(&table_data_y0[i * table_hw], &table_data_y0[0], sizeof(uint16_t) * table_hw); + } +} + +void cvm_atan_slope(uint16_t* OUT table_slope, cvk_tl_shape_t* table_shape) { + int table_hw = cvm_table_hw(); + + int lut_sz = sizeof(LUT_d) / sizeof(LUT_d[0]) - 1; + for (volatile int i = 0; i < lut_sz; i++) { + table_slope[i] = convert_fp32_bf16(LUT_d[i + 1] - LUT_d[i]); + } + + // duplicate channel #1 to #31 + // TODO: tensor copy + for (uint64_t i = 1; i < table_shape->c; i++) { + memcpy(&table_slope[table_hw * i], &table_slope[0], sizeof(uint16_t) * table_hw); + } +} + +// 'bf16_atan_s_01' means atan split [0 1] and (1, +// data in [0-1] mutilply 1, > 1 mutiply with -1 +void cvm_atan_s_01(uint16_t* OUT table_invert, cvk_tl_shape_t* table_shape) { + int half = half_h_table(); + int table_hw = cvm_table_hw(); + + // data in [0, 1], mutilply 1 +#if 1 + for (uint32_t i = 0; i < 63; i++) { + table_invert[i] = convert_fp32_bf16(1.0); + table_invert[i + half] = convert_fp32_bf16(1.0); + } + + // data > 1 + for (int i = 63; i < half; i++) { + table_invert[i] = convert_fp32_bf16(-1.0); + table_invert[i + half] = convert_fp32_bf16(-1.0); + } +#endif + + // duplicate channel #1 to #31 + // TODO: tensor copy + for (uint64_t i = 1; i < table_shape->c; i++) { + memcpy(&table_invert[table_hw * i], &table_invert[0], sizeof(uint16_t) * table_hw); + } +} + +// 
'pos_neg' means data is positive(>=0) is 1 or negtive(<0) is -1 +void cvm_pos_neg_tbl(uint16_t* OUT table_pos_neg, cvk_tl_shape_t* table_shape) { + uint32_t half = half_h_table(); + int table_hw = cvm_table_hw(); + + // data >= 0 + for (uint32_t i = 0; i < half; i++) { + table_pos_neg[i] = convert_fp32_bf16(1.0); + } + + // data < 0 + for (uint32_t i = half; i < half * 2; i++) { + table_pos_neg[i] = convert_fp32_bf16(-1.0); + } + + // duplicate channel #1 to #31 + // TODO: tensor copy + for (uint64_t i = 1; i < table_shape->c; i++) { + memcpy(&table_pos_neg[table_hw * i], &table_pos_neg[0], sizeof(uint16_t) * table_hw); + } +} + +void cvm_atan_pos_neg(uint16_t* OUT table_pos_neg, cvk_tl_shape_t* table_shape) { + cvm_pos_neg_tbl(table_pos_neg, table_shape); +} + +/* Syntactic sugar for get more precision + * raw implement code : + + double re_x = 1 / x; + int index = round(re_x * 100); + return (M_PI_2 - (LUT_d[index] + (re_x * 100 - index) * (LUT_d[index + 1] - LUT_d[index]))); + and we want to get `(LUT_d[index] + (re_x * 100 - index)` part + */ +int cvm_atan_slope_multipilier(cvk_context_t* ctx, cvk_tl_t* tl_buf, cvk_tl_t* tl_buf2, + cvk_tl_t* tl_buf3, cvk_tl_t* tl_ifmap, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t fmt) { + (void)fmt; + cvm_get_dec(ctx, tl_buf, tl_buf2, tl_buf3); + // z = (min(x,y) * 100 - index) * slope(index) + + // fill to 100 + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_buf2; + p1.a = tl_buf; + p1.b_is_const = 1; + p1.b_const.val = convert_fp32_bf16(0); + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + // add + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_buf2; + p4.a_high = 0; + p4.a_low = tl_buf2; + p4.b_is_const = 1; + p4.b.high = 0; + p4.b_const.val = convert_fp32_bf16(-100.0); + p4.rshift_bits = 0; + p4.relu_enable = 0; + + ctx->ops->tiu_add(ctx, &p4); + + cvk_tiu_mac_param_t p2; + p2.res_high = 0; + p2.res_low = tl_buf3; + p2.res_is_int8 = 0; + p2.a = tl_ifmap; + 
p2.b_is_const = 0; + p2.b = tl_buf2; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + + ctx->ops->tiu_mac(ctx, &p2); + + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_buf3; + p1.b_is_const = 1; + p1.b_const.val = convert_fp32_bf16(-1.0); + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + return 0; +} + +/** issue atan >= 0 + * \b for more precision, we use mac for atan2 + * if (x > 1) { + * x = 1 / x + * } + * int index = round(x * 100); + * double r = (x * 100 - index) * (LUT_d[index + 1] - LUT_d[index]); + * double shift = LUT_d[index]; + * if (x > 1) { + * shift = M_PI_2 - LUT_d[index]; + * } + * return r + shift; + * FIXME: reduce temp buffer count + */ +int _cvm_atan_emit(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf, cvk_tl_t* tl_buf2, + cvk_tl_t* tl_buf3, cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_slope_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_buf, cvk_tl_t* tl_table_answer, + cvk_tl_t* tl_table_answer_mantissa, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, + float b) { + cvm_table_check(tl_ifmap, tl_y0_buf, tl_slope_buf, tl_ifmap); + cvm_table_check(tl_buf, tl_invert_buf, tl_pos_neg_buf, tl_buf2); + cvm_table_check(tl_buf3, tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16); + + // x = abs(x0) + // y = 1 / x + // index = 100 * min(x, y) + // z = (min(x,y) * 100 - index) * slope(index) + // invert = table_0_102(x) ([0-1] return 1, >1 return -1) + // invert = invert * z + // t = 64 * (table_0_102 + 1) + // shift_index = t(index) ([0-1] return 102, >1 return 0) + // shift = y0(shift_index + index) ([0-1] return LUT_d[index], >1 return M_PI_2 - LUT_d[index] + // p = table_pos_neg(x0) (>= 0 return 1, < 0 return -1) + // p(shift + invert * z) + + cvm_emit_abs(ctx, tl_ifmap, tl_buf, fmt); + + // y = 1 / x + cvm_emit_reciprocal(ctx, tl_buf, tl_buf2, tl_table_answer, tl_table_answer_mantissa, + tl_ofmap_bf16); + + cvk_tiu_min_param_t p7; + p7.min = 
tl_ofmap_bf16; + p7.a = tl_buf; + p7.b_is_const = 0; + p7.b = tl_ofmap_bf16; + + ctx->ops->tiu_min(ctx, &p7); + + // get index + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_buf; + p1.a = tl_ofmap_bf16; + p1.b_is_const = 1; + p1.b_const.val = convert_fp32_bf16(100.0); + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + cvm_atan_slope_multipilier(ctx, tl_buf, tl_buf2, tl_buf3, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // get int8 index of x2 + cvm_get_uint8_t_tbl_idx(ctx, tl_buf, tl_buf2); + + // x0 = base[x2] + (0.x * (slope[x2]) + // TODO: use mac + + // get slope[x2] + cvk_tiu_lookup_table_param_t p12; + p12.ofmap = tl_buf3; + p12.ifmap = tl_buf2; + p12.table = tl_slope_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); + + // z = (min(x,y) * 100 - index) * slope(index) + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ofmap_bf16; + p1.b_is_const = 0; + p1.b = tl_buf3; + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + // get index from exp, + // mv_lut_base get exp as index, remove mantissa + cvk_tdma_l2l_tensor_copy_param_t p10; + p10.dst = tl_buf3; + p10.src = tl_ifmap; + p10.mv_lut_base = false; + p10.mv_lut_idx = true; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + // invert = table_0_102(x) ([0-1] return 1, >1 return -1) + p12.ofmap = tl_buf3; + p12.ifmap = tl_buf3; + p12.table = tl_invert_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); + + // z = invert * z + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_ofmap_bf16; + p1.b_is_const = 0; + p1.b = tl_buf3; + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + // t = 51 * (invert + 1), -> invert + 1 + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = tl_buf3; + p4.a_high = 0; + p4.a_low = tl_buf3; + p4.b_is_const = 1; + p4.b.high = 0; + p4.b_const.val = convert_fp32_bf16(1.0); + p4.rshift_bits = 0; + p4.relu_enable = 0; + + 
ctx->ops->tiu_add(ctx, &p4); + + // t = 51 * (invert + 1) + p1.res_high = NULL; + p1.res_low = tl_buf3; + p1.a = tl_buf3; + p1.b_is_const = 1; + p1.b_const.val = convert_fp32_bf16(51.0); + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + +#if 1 + // avoid rounding, we first round org index + cvm_get_uint8_t_tbl_idx(ctx, tl_buf, tl_buf2); + tl_buf2->fmt = CVK_FMT_U8; + cvk_tl_shape_t t = tl_buf2->shape; + cvk_tl_stride_t s = tl_buf2->stride; + tl_buf2->shape.h = tl_buf2->shape.h * tl_buf2->shape.w; + tl_buf2->shape.w = 1; + tl_buf2->stride.h = 2; + tl_buf2->stride.c = tl_buf2->shape.h * tl_buf2->shape.w; + tl_buf2->stride.c = tl_buf2->shape.c * tl_buf2->shape.h * tl_buf2->shape.w; + p10.dst = tl_buf; + p10.src = tl_buf2; + p10.mv_lut_base = false; + p10.mv_lut_idx = false; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + tl_buf2->fmt = CVK_FMT_BF16; + tl_buf2->shape = t; + tl_buf2->stride = s; +#else +#endif + // t = t + index + p4.res_high = 0; + p4.res_low = tl_buf3; + p4.a_high = 0; + p4.a_low = tl_buf3; + p4.b_is_const = 0; + p4.b.high = 0; + p4.b.low = tl_buf; + p4.rshift_bits = 0; + p4.relu_enable = 0; + + ctx->ops->tiu_add(ctx, &p4); + + // get int8 index for lut + cvm_get_uint8_t_tbl_idx(ctx, tl_buf3, tl_buf); + + // shift = y0(t) ([0-1] return LUT_d[index], >1 return M_PI_2 - LUT_d[index] + p12.ofmap = tl_buf3; + p12.ifmap = tl_buf; + p12.table = tl_y0_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); + + // z = base[x2] + (0.x * (slope[x2]) + p4.res_high = 0; + p4.res_low = tl_buf2; + p4.a_high = 0; + p4.a_low = tl_ofmap_bf16; + p4.b_is_const = 0; + p4.b.high = 0; + p4.b.low = tl_buf3; + p4.rshift_bits = 0; + p4.relu_enable = 0; + + ctx->ops->tiu_add(ctx, &p4); + + // get pos neg, use mv_lut_idx + p10.dst = tl_buf3; + p10.src = tl_ifmap; + p10.mv_lut_base = false; + p10.mv_lut_idx = true; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + // p = table_pos_neg(x0) (>= 0 return 1, < 0 return -1) + 
p12.ofmap = tl_buf3; + p12.ifmap = tl_buf3; + p12.table = tl_pos_neg_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); +#if 0 + // p * z + p1.res_high = NULL; + p1.res_low = tl_ofmap_bf16; + p1.a = tl_buf2; + p1.b_is_const = 0; + p1.b = tl_buf3; + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); +#else + + // add pi/-pi for atan2 + cvm_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 0.0); + cvm_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, b); + + // p * z + pi + cvk_tiu_mac_param_t p2; + p2.res_high = 0; + p2.res_low = tl_ofmap_bf16; + p2.res_is_int8 = 0; + p2.a = tl_buf2; + p2.b_is_const = 0; + p2.b = tl_buf3; + p2.lshift_bits = 0; // lshift_bits; + p2.rshift_bits = 0; // rshift_bits; + p2.relu_enable = 0; + + ctx->ops->tiu_mac(ctx, &p2); +#endif + + return 0; +} + +int cvm_atan_emit(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf, cvk_tl_t* tl_buf2, + cvk_tl_t* tl_buf3, cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_slope_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_buf, cvk_tl_t* tl_table_answer, + cvk_tl_t* tl_table_answer_mantissa, cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + return _cvm_atan_emit(ctx, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, tl_slope_buf, + tl_invert_buf, tl_pos_neg_buf, tl_table_answer, tl_table_answer_mantissa, + tl_ofmap_bf16, fmt, 0.0); +} + +/** + * \table_data_atan_slope is optional, NULL for not assign it + */ +void cvm_atan_tbl(uint16_t* table_data_atan_y0, uint16_t* table_data_atan_slope, + uint16_t* table_data_atan_invert, uint16_t* table_data_atan_pos_neg, + cvk_tl_shape_t* table_shape) { + ASSERT(table_data_atan_y0); + // ASSERT(table_data_atan_slope); + ASSERT(table_data_atan_invert); + ASSERT(table_data_atan_pos_neg); + ASSERT(table_shape); + + cvm_atan_y0(table_data_atan_y0, table_shape); + if (table_data_atan_slope) { + cvm_atan_slope(table_data_atan_slope, table_shape); + } + cvm_atan_s_01(table_data_atan_invert, table_shape); + cvm_pos_neg_tbl(table_data_atan_pos_neg, 
table_shape); +} + +void cvm_atan_fast_degree_tbl(uint16_t* table_data_atan_y0, uint16_t* table_data_atan_invert, + uint16_t* table_data_atan_pos_neg, cvk_tl_shape_t* table_shape) { + ASSERT(table_data_atan_y0); + ASSERT(table_data_atan_invert); + ASSERT(table_data_atan_pos_neg); + ASSERT(table_shape); + + cvm_atan_fast_degree_y0(table_data_atan_y0, table_shape); + cvm_atan_s_01(table_data_atan_invert, table_shape); + cvm_pos_neg_tbl(table_data_atan_pos_neg, table_shape); +} + +/** issue atan >= 0 + * for fast version, we discard slope + * tl_y0_buf[0-102) put 'LUT[index]', [102-204) for 'M_PI_2 - LUT[index]' + */ +int _cvm_atan_fast_emit(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf, cvk_tl_t* tl_buf2, + cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_buf, + cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, float b, bool is_dirty_ifmap) { + cvm_table_check(tl_ifmap, tl_y0_buf, tl_y0_buf, tl_ifmap); + cvm_table_check(tl_buf, tl_invert_buf, tl_pos_neg_buf, tl_buf); + cvm_table_check(tl_buf, tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16); + + cvk_tiu_lookup_table_param_t p12; + cvk_tdma_l2l_tensor_copy_param_t p10; + + // plz refer https://github.com/xiezhq-hermann/atan_lookup/blob/master/atan.cpp + // for faster version + cvm_emit_abs(ctx, tl_ifmap, tl_buf, fmt); + + // y = 1 / x + _cvm_lut_exp_mantissa(ctx, tl_buf, NULL, tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16, + true); + + // once again cuz recipical's input dirtied + cvm_emit_abs(ctx, tl_ifmap, tl_buf, fmt); + + cvk_tiu_min_param_t p7; + p7.min = tl_buf; + p7.a = tl_buf; + p7.b_is_const = 0; + p7.b = tl_ofmap_bf16; + + ctx->ops->tiu_min(ctx, &p7); + + // get index + cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, 100.0); + + // get index from exp, + // mv_lut_base get exp as index, remove mantissa + p10.dst = tl_ofmap_bf16; + p10.src = tl_ifmap; + p10.mv_lut_base = false; + p10.mv_lut_idx = true; + 
ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + cvk_tl_t* tmp = tl_buf2; + if (is_dirty_ifmap) { + tmp = tl_ifmap; + } + + // get pos neg, use mv_lut_idx + p10.dst = tmp; + p10.src = tl_ifmap; + p10.mv_lut_base = false; + p10.mv_lut_idx = true; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + // p = table_pos_neg(x0) (>= 0 return 1, < 0 return -1) + p12.ofmap = tmp; + p12.ifmap = tmp; + p12.table = tl_pos_neg_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); + + // get index of LUT[index] or (M_PI_2 - LUT[index]) + { + // invert = table_0_102(x) ([0-1] return 1, >1 return -1) + p12.ofmap = tl_ofmap_bf16; + p12.ifmap = tl_ofmap_bf16; + p12.table = tl_invert_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); + + cvk_tl_t* out = tl_buf; +#if 1 + // t = 51 * (invert + 1), -> invert + 1 + cvm_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 1.0); + + // t = 51 * (invert + 1) + cvm_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 51.0); + + // t = t + index + cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // get int8 index for lut + // cvm_get_uint8_t_tbl_idx(ctx, tl_ofmap_bf16, tl_buf); + //_cvm_get_tbl_idx(ctx, tl_ofmap_bf16, tl_buf, CVK_FMT_U8, 0); + + //// shift = y0(t) ([0-1] return LUT_d[index], >1 return M_PI_2 - LUT_d[index] + // p12.ofmap = tl_buf; + // p12.ifmap = tl_buf; + // p12.table = tl_y0_buf; + // ctx->ops->tiu_lookup_table(ctx, &p12); + + _cvm_get_tbl_idx(ctx, tl_ofmap_bf16, tl_buf, CVK_FMT_U8, 0); + +#else + // index, output is uint8 format + _cvm_get_tbl_idx(ctx, tl_buf, tl_buf, CVK_FMT_U8, 0); + + // mask value from bf16 -> int8, we add as bf16 + // int8 format (51*(mask + 1) + index) is real remap index for table + // mask = mask + 1 + cvm_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 1.0); + + // mask = 51 * mask + cvm_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 51.0); + + // mask value change to int8 format for lut + _cvm_get_tbl_idx(ctx, 
tl_ofmap_bf16, tl_ofmap_bf16, CVK_FMT_U8, 0); + + // int8 format (51*(mask) + index) is real remap index for table + if (1) { + cvk_tl_t index_uint8_t, mask_uint8_t, fake_uint8_t, out_uint8_t; + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &index_uint8_t, tl_buf, CVK_FMT_U8); + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &out_uint8_t, tl_buf, CVK_FMT_U8); + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &mask_uint8_t, tl_ofmap_bf16, CVK_FMT_U8); + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &fake_uint8_t, tmp, CVK_FMT_U8); + // fake_uint8_t.start_address = + // ctx->info.lmem_size - (fake_uint8_t.shape.h * fake_uint8_t.shape.w); + // //tl_buf->start_address + 1; + // //tl_ifmap->start_address; + + // mask + index + // its safe we only need low part value, so we give fake high part + + cvk_tl_t* a = bmk1880v2_lmem_alloc_tensor(ctx, out_uint8_t.shape, CVK_FMT_U8, CTRL_NULL); +#if 1 + cvk_tiu_add_param_t p4; + p4.res_high = 0; + // p4.res_low = &mask_uint8_t; + p4.res_low = &index_uint8_t; + p4.a_high = a; + p4.a_low = &index_uint8_t; + p4.b_is_const = 0; + p4.b.high = a; + p4.b.low = &mask_uint8_t; + p4.rshift_bits = 0; + p4.relu_enable = 0; + + ctx->ops->tiu_add(ctx, &p4); + // out = tl_ofmap_bf16; +#else + { + cvk_tiu_mul_param_t p; + p.res_high = NULL; + p.res_low = a; + p.a = a; + p.b_is_const = 1; + p.b_const.val = 0; + p.b_const.is_signed = 0; + p.rshift_bits = 0; + p.relu_enable = 0; + ctx->ops->tiu_mul(ctx, &p); + } + + out = tl_ofmap_bf16; + cvk_tiu_mac_param_t p2; + p2.res_high = a; + p2.res_low = &mask_uint8_t; + p2.res_is_int8 = 0; + p2.a = &index_uint8_t; + p2.b_is_const = 1; + p2.b = 0; + p2.b_const.val = 1; + p2.b_const.is_signed = 0; + p2.lshift_bits = 0; + p2.rshift_bits = 0; + p2.relu_enable = 0; + ctx->ops->tiu_mac(ctx, &p2); +#endif + bmk1880v2_lmem_free_tensor(ctx, a); + } else { + // move bak to bf16 + // cvk_tl_t index_uint8_t, mask_uint8_t; + // bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &index_uint8_t, tl_buf, CVK_FMT_U8); + // 
bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &mask_uint8_t, tl_ofmap_bf16, CVK_FMT_U8); + + // p10.dst = tl_buf; + // p10.src = &index_uint8_t; + // p10.mv_lut_base = false; + // p10.mv_lut_idx = false; + // ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + + // p10.dst = tl_ofmap_bf16; + // p10.src = &mask_uint8_t; + // ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + + // cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + //_cvm_get_tbl_idx(ctx, tl_ofmap_bf16, tl_buf, CVK_FMT_U8, 0); + } +#endif + + // shift = y0(t) ([0-1] return LUT_d[index], >1 return M_PI_2 - LUT_d[index] + p12.ofmap = out; + p12.ifmap = out; + p12.table = tl_y0_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); + } + +#if 0 + // add pi/-pi for atan2 + cvm_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 0.0); + cvm_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, b); + + // p * z + pi + cvk_tiu_mac_param_t p2; + p2.res_high = 0; + p2.res_low = tl_ofmap_bf16; + p2.res_is_int8 = 0; + p2.a = tl_buf; + p2.b_is_const = 0; + p2.b = tmp; + p2.lshift_bits = 0;//lshift_bits; + p2.rshift_bits = 0;//rshift_bits; + p2.relu_enable = 0; + + ctx->ops->tiu_mac(ctx, &p2); +#else + cvm_emit_mul(ctx, tl_buf, tmp, tl_ofmap_bf16, fmt); + cvm_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, b); +#endif + + return 0; +} + +/** + * \brief using \tl_buf2 as temp buffer for uint8_t add + * \NOTICE: it dirties input: \tl_ifmap + */ +int __cvm_atan_fast_emit(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_invert_buf, + cvk_tl_t* tl_pos_neg_buf, cvk_tl_t* tl_table_answer, + cvk_tl_t* tl_table_answer_mantissa, cvk_tl_t* OUT tl_ofmap_bf16, + cvk_fmt_t fmt) { + cvm_table_check(tl_ifmap, tl_y0_buf, tl_y0_buf, tl_ifmap); + cvm_table_check(tl_buf, tl_invert_buf, tl_pos_neg_buf, tl_buf); + cvm_table_check(tl_buf, tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16); + + cvk_tiu_lookup_table_param_t p12; + + // plz refer 
https://github.com/xiezhq-hermann/atan_lookup/blob/master/atan.cpp + // for faster version + cvm_emit_abs(ctx, tl_ifmap, tl_buf, fmt); + + // y = 1 / x + _cvm_lut_exp_mantissa(ctx, tl_buf, NULL, tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16, + true); + + // once again cuz recipical's input dirtied + cvm_emit_abs(ctx, tl_ifmap, tl_buf, fmt); + + cvk_tiu_min_param_t p7; + p7.min = tl_buf; + p7.a = tl_buf; + p7.b_is_const = 0; + p7.b = tl_ofmap_bf16; + + ctx->ops->tiu_min(ctx, &p7); + + // get index + cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, 100.0); + + // get index from exp, + // mv_lut_base get exp as index, remove mantissa +#if 1 + cvk_tdma_l2l_tensor_copy_param_t p10; + p10.dst = tl_ofmap_bf16; + p10.src = tl_ifmap; + p10.mv_lut_base = false; + p10.mv_lut_idx = true; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; +#else + cvm_emit_abs(ctx, tl_ifmap, tl_ofmap_bf16, fmt); + cvm_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 0.5); +#endif + + cvk_tl_t* tmp = tl_buf2; + tmp = tl_ifmap; + +#if 0 + // get pos neg, use mv_lut_idx + p10.dst = tmp; + p10.src = tl_ifmap; + p10.mv_lut_base = false; + p10.mv_lut_idx = true; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + p10.mv_lut_idx = false; + + // p = table_pos_neg(x0) (>= 0 return 1, < 0 return -1) + p12.ofmap = tmp; + p12.ifmap = tmp; + p12.table = tl_pos_neg_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); + //p12.ofmap = tl_ofmap_bf16; + //p12.ifmap = tmp; + //p12.table = tl_pos_neg_buf; + //ctx->ops->tiu_lookup_table(ctx, &p12); + //return 0; +#else + // dirty input is ok + cvk_tl_t index_i8; + bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, &index_i8, tl_buf2, CVK_FMT_I8); + cvm_emit_mask_ge0_lt0(ctx, tmp, &index_i8, tmp, fmt); + // cvm_emit_mask_ge0_lt0(ctx, tmp, &index_i8, tl_ofmap_bf16, fmt); + // return 0; +#endif + + // get index of LUT[index] or (M_PI_2 - LUT[index]) + { +#if 1 + // invert = table_0_102(x) ([0-1] return 1, >1 return -1) + p12.ofmap = 
tl_ofmap_bf16; + p12.ifmap = tl_ofmap_bf16; + p12.table = tl_invert_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); +#else + { + cvk_tl_t index_i8; + bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, &index_i8, tl_buf2, CVK_FMT_I8); + // 1. abs + // 2. add 0.5 to round bf16->int8 + // 3. leave (0,1) and others, rightshift 1 to get 0, others + // 4. saturate to int max, and transform from int8 to bf16 + + // cvm_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, convert_bf16_fp32(0x7f00)); + cvk_tdma_l2l_tensor_copy_param_t p1; + p1.src = tl_ofmap_bf16; + p1.dst = &index_i8; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + cvk_tiu_mul_param_t p; + p.res_high = NULL; + p.res_low = &index_i8; + p.a = &index_i8; + p.b_is_const = 1; + p.b_const.val = 1; + p.b_const.is_signed = 0; + p.rshift_bits = 1; + p.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p); + + // p.res_high = NULL; + // p.res_low = &index_i8; + // p.a = &index_i8; + // p.b_is_const = 1; + // p.b_const.val =-1; + // p.b_const.is_signed = 1; + // p.rshift_bits = 7; + // p.relu_enable = 0; + + // ctx->ops->tiu_mul(ctx, &p); + + p1.src = &index_i8; + p1.dst = tl_ofmap_bf16; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + return 0; + + // cvm_emit_mask_eq_0(ctx, tl_ofmap_bf16, tl_ofmap_bf16, &index_i8, tl_ofmap_bf16, fmt); + cvm_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 2.0); + cvm_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 1.0); + } +#endif + + cvk_tl_t* out = tl_buf; +#if 0 + // t = 51 * (invert + 1), -> invert + 1 + cvm_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 1.0); + + // t = 51 * (invert + 1) + cvm_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 51.0); + + // t = t + index + cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // get int8 index for lut + //bf16_get_uint8_t_tbl_idx(ctx, tl_ofmap_bf16, tl_buf); + //_cvm_get_tbl_idx(ctx, tl_ofmap_bf16, tl_buf, CVK_FMT_U8, 0); + + //// shift = y0(t) ([0-1] return LUT_d[index], >1 return M_PI_2 - 
LUT_d[index] + //p12.ofmap = tl_buf; + //p12.ifmap = tl_buf; + //p12.table = tl_y0_buf; + //ctx->ops->tiu_lookup_table(ctx, &p12); + + _cvm_get_tbl_idx(ctx, tl_ofmap_bf16, tl_buf, CVK_FMT_U8, 0); + +#else + // index, output is uint8 format + _cvm_get_tbl_idx(ctx, tl_buf, tl_buf, CVK_FMT_U8, 0); + + // mask value from bf16 -> int8, we add as bf16 + // int8 format (51*(mask + 1) + index) is real remap index for table + // mask = mask + 1 + cvm_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 1.0); + + // mask = 51 * mask + cvm_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 51.0); + + // mask value change to int8 format for lut + _cvm_get_tbl_idx(ctx, tl_ofmap_bf16, tl_ofmap_bf16, CVK_FMT_U8, 0); + + // int8 format (51*(mask) + index) is real remap index for table + if (1) { + cvk_tl_t index_uint8_t, mask_uint8_t, fake_uint8_t, out_uint8_t; + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &index_uint8_t, tl_buf, CVK_FMT_U8); + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &out_uint8_t, tl_buf, CVK_FMT_U8); + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &mask_uint8_t, tl_ofmap_bf16, CVK_FMT_U8); + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &fake_uint8_t, tl_buf2, CVK_FMT_U8); +#if 0 + // mask + index + // its safe we only need low part value, so we give fake high part + cvk_tiu_add_param_t p4; + p4.res_high = 0; + p4.res_low = &index_uint8_t; + p4.a_high = &fake_uint8_t; + p4.a_low = &index_uint8_t; + p4.b_is_const = 0; + p4.b.high = &fake_uint8_t; + p4.b.low = &mask_uint8_t; + p4.rshift_bits = 0; + p4.relu_enable = 0; + + ctx->ops->tiu_add(ctx, &p4); +#else + cvk_tiu_mac_param_t p2; + p2.res_high = &fake_uint8_t; + p2.res_low = &index_uint8_t; + p2.res_is_int8 = 0; + p2.a = &mask_uint8_t; + p2.b_is_const = 1; + p2.b = 0; + p2.b_const.val = 1; + p2.b_const.is_signed = 0; + p2.lshift_bits = 0; + p2.rshift_bits = 0; + p2.relu_enable = 0; + + ctx->ops->tiu_mac(ctx, &p2); +#endif + + } else { + // move bak to bf16 + // cvk_tl_t index_uint8_t, mask_uint8_t; + // 
bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &index_uint8_t, tl_buf, CVK_FMT_U8); + // bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &mask_uint8_t, tl_ofmap_bf16, CVK_FMT_U8); + + // p10.dst = tl_buf; + // p10.src = &index_uint8_t; + // p10.mv_lut_base = false; + // p10.mv_lut_idx = false; + // ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + + // p10.dst = tl_ofmap_bf16; + // p10.src = &mask_uint8_t; + // ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + + // cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + //_cvm_get_tbl_idx(ctx, tl_ofmap_bf16, tl_buf, CVK_FMT_U8, 0); + } +#endif + + // shift = y0(t) ([0-1] return LUT_d[index], >1 return M_PI_2 - LUT_d[index] + p12.ofmap = out; + p12.ifmap = out; + p12.table = tl_y0_buf; + ctx->ops->tiu_lookup_table(ctx, &p12); + } + + cvm_emit_mul(ctx, tl_buf, tmp, tl_ofmap_bf16, fmt); + + return 0; +} + +int cvm_atan_fast_emit(cvk_context_t* ctx, cvk_tl_t* tl_ifmap, cvk_tl_t* tl_buf, cvk_tl_t* tl_buf2, + cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_buf, + cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, bool is_dirty_ifmap) { + return _cvm_atan_fast_emit(ctx, tl_ifmap, tl_buf, tl_buf2, tl_y0_buf, tl_invert_buf, + tl_pos_neg_buf, tl_table_answer, tl_table_answer_mantissa, + tl_ofmap_bf16, fmt, 0.0, is_dirty_ifmap); +} diff --git a/cvimath/src/tiu_lut_atan2.c b/cvimath/src/tiu_lut_atan2.c new file mode 100644 index 000000000..36a6e422f --- /dev/null +++ b/cvimath/src/tiu_lut_atan2.c @@ -0,0 +1,787 @@ +/** + * \brirf implement with atan, plz refer https://en.wikipedia.org/wiki/Atan2 + * NOTICE: current epsilon set to 0.1 + */ +#include +#include "gen_lut.h" // NOLINT + +//#define DBG + +static void _cvm_atan2_emit_case_3(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_buf4, + cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_slope_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table, + 
cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, float b) { + // case 3 + // atan( y / x) + + // x0 = reciprocal(x) + cvm_emit_reciprocal(ctx, x, tl_buf2, tl_table_answer, tl_table_answer_mantissa, tl_buf); + + // y0 = x0 * y + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_buf4; + p1.a = y; + p1.b_is_const = 0; + p1.b = tl_buf; + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + // x0 = atan(y0) + _cvm_atan_emit(ctx, tl_buf4, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, tl_slope_buf, tl_invert_buf, + tl_pos_neg_table, tl_table_answer, tl_table_answer_mantissa, OUT tl_ofmap_bf16, + fmt, b); +} + +static void cvm_atan2_emit_case_3(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_buf4, + cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_slope_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table, + cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + _cvm_atan2_emit_case_3(ctx, y, x, tl_buf, tl_buf2, tl_buf3, tl_buf4, tl_y0_buf, tl_slope_buf, + tl_invert_buf, tl_pos_neg_table, tl_table_answer, tl_table_answer_mantissa, + tl_ofmap_bf16, fmt, 0.0); +} + +// NOTICE: it could dirty \y +/** + * atan2(y, x) should express 4 condition using atan express from + * [here](https://en.wikipedia.org/wiki/Atan2) + */ +void cvm_atan2_emit(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_buf4, cvk_tl_t* tl_buf5, + cvk_tl_t* tl_buf6, cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_slope_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table, cvk_tl_t* tl_table_answer, + cvk_tl_t* tl_table_answer_mantissa, cvk_tl_t* tl_sqrt_table_answer, + cvk_tl_t* tl_sqrt_table_answer_mantissa, cvk_tl_t* tl_0_idx_table, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + cvm_table_check(y, tl_y0_buf, tl_slope_buf, x); + 
cvm_table_check(tl_buf, tl_invert_buf, tl_pos_neg_table, tl_buf2); + cvm_table_check(tl_buf3, tl_table_answer, tl_table_answer_mantissa, tl_buf4); + cvm_table_check(tl_buf6, tl_table_answer, tl_0_idx_table, tl_buf5); + cvm_table_check(y, tl_sqrt_table_answer, tl_sqrt_table_answer_mantissa, x); + + // atan(y/x), x > 0 + // atan(y/x) + PI , x < 0 and y >= 0 + // atan(y/x) - PI , x < 0 and y < 0 + // pi / 2, x = 0 and y > 0 + // -pi / 2, x = 0 and y < 0 + // 0, x = 0 and y = 0 + + // atan(y/x), x > 0 + cvm_emit_max_const(ctx, x, tl_buf4, fmt, 0.0); + cvm_atan2_emit_case_3(ctx, y, tl_buf4, tl_buf, tl_buf2, tl_buf3, tl_buf5, tl_y0_buf, tl_slope_buf, + tl_invert_buf, tl_pos_neg_table, tl_table_answer, tl_table_answer_mantissa, + tl_ofmap_bf16, fmt); + + // x > 0 + cvm_emit_mask_gt0(ctx, x, tl_buf, tl_buf3, tl_buf4, tl_pos_neg_table, tl_0_idx_table, tl_buf2, + fmt); + + cvm_emit_mul(ctx, tl_ofmap_bf16, tl_buf2, tl_ofmap_bf16, fmt); + + // atan(y/x) + PI , x < 0 and y >= 0 + cvm_emit_min_const(ctx, x, tl_buf4, fmt, 0.0); + _cvm_atan2_emit_case_3(ctx, y, tl_buf4, tl_buf, tl_buf2, tl_buf3, tl_buf5, tl_y0_buf, + tl_slope_buf, tl_invert_buf, tl_pos_neg_table, tl_table_answer, + tl_table_answer_mantissa, tl_buf6, fmt, M_PI); + // cvm_emit_add_const(ctx, tl_buf6, tl_buf6, fmt, M_PI); + + // get index map that x < 0 and y >= 0 + // !(y >= 0) = !(y < 0) +#if 0 + cvm_emit_pos_idx(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // y == 0 + cvm_emit_0_idx(ctx, y, tl_buf, tl_0_idx_table, tl_buf2, fmt); + cvm_emit_add(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); +#else + // y >= 0 + cvm_emit_mask_ge0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf2, fmt); +#endif + // x < 0 + cvm_emit_mask_lt0(ctx, x, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // x < 0 && y >= 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + + cvm_emit_mul(ctx, tl_buf6, tl_buf2, tl_buf, fmt); + cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // atan(y/x) - PI , x < 0 and y < 0 + cvm_emit_min_const(ctx, 
x, tl_buf4, fmt, 0.0); + cvm_atan2_emit_case_3(ctx, y, tl_buf4, tl_buf, tl_buf2, tl_buf3, tl_buf5, tl_y0_buf, tl_slope_buf, + tl_invert_buf, tl_pos_neg_table, tl_table_answer, tl_table_answer_mantissa, + tl_buf6, fmt); + cvm_emit_add_const(ctx, tl_buf6, tl_buf6, fmt, -1.0 * M_PI); + // x < 0 and y < 0 + + // we leverage x <= 0 and y <= 0 cuz we filter out x = 0 case, speed up it + // x < 0 + cvm_emit_mask_lt0(ctx, x, tl_buf, tl_pos_neg_table, tl_buf2, fmt); + // y < 0 + cvm_emit_mask_lt0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // x < 0 && y < 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + + cvm_emit_mul(ctx, tl_buf6, tl_buf2, tl_buf, fmt); + cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // pi / 2, x = 0 and y > 0 + // x = 0 + cvm_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1 + + // y > 0 + cvm_emit_mask_gt0(ctx, y, tl_buf, tl_buf5, tl_buf4, tl_pos_neg_table, tl_0_idx_table, tl_buf3, + fmt); + // x = 0 && y > 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, fmt, M_PI / 2.0); + + cvm_emit_add(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // -pi / 2, x = 0 and y < 0 + // x = 0 + cvm_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1 + // y < 0 + cvm_emit_mask_lt0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // x = 0 && y < 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + + cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, fmt, -1.0 * M_PI / 2.0); + + cvm_emit_add(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // 0, x = 0 and y = 0 + // x = 0 + cvm_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1 + // y = 0 + cvm_emit_mask_eq0(ctx, y, tl_buf, tl_0_idx_table, tl_buf3, fmt); // 0.003 could consider 1 + + // x = 0 && y = 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf3, fmt); + + // !(x = 0 and y = 0) keep it + cvm_emit_0_1_revert_input(ctx, tl_buf3, 
tl_buf, tl_buf2, fmt); + cvm_emit_mul(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt); +} + +// ==== fast version === +static void __cvm_atan2_fast_emit_case_3(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, + cvk_tl_t* tl_buf, cvk_tl_t* tl_y0_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table, + cvk_tl_t* tl_table_answer, + cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_tl_t* OUT y_over_x, + cvk_fmt_t fmt, float b) { + // case 3 + // atan( y / x) + +#if 0 + // x0 = reciprocal(x) + _cvm_lut_exp_mantissa(ctx, + x, + NULL, + tl_table_answer, + tl_table_answer_mantissa, + tl_buf, + true + ); + + // y0 = x0 * y + cvm_emit_mul(ctx, y, tl_buf, tl_buf, fmt); +#else + cvm_emit_x_over_y(ctx, y, x, NULL, tl_buf, tl_table_answer, tl_table_answer_mantissa, fmt, true); + + if (y_over_x) { + cvm_emit_add_const(ctx, tl_buf, y_over_x, fmt, 0); + } +#endif + + // x0 = atan(y0) + _cvm_atan_fast_emit(ctx, tl_buf, x, NULL, tl_y0_buf, tl_invert_buf, tl_pos_neg_table, + tl_table_answer, tl_table_answer_mantissa, OUT tl_ofmap_bf16, fmt, b, true); +} + +#if 0 +static void _cvm_atan2_fast_emit(cvk_context_t *ctx, + cvk_tl_t* y, + cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, + cvk_tl_t* tl_buf4, + cvk_tl_t* tl_y0_buf, + cvk_tl_t* tl_invert_buf, + cvk_tl_t* tl_pos_neg_table, + cvk_tl_t* tl_table_answer, + cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, + cvk_tl_t* OUT tl_buf3, + cvk_fmt_t fmt) { + // case 3 + // atan( y / x) + +#if 0 + // x0 = reciprocal(tl_buf) + _cvm_lut_exp_mantissa(ctx, + tl_buf, + NULL, + tl_table_answer, + tl_table_answer_mantissa, + tl_buf2, + true + ); + + // y0 = x0 * y + cvm_emit_mul(ctx, y, tl_buf2, tl_buf2, fmt); +#else +#if 0 + cvm_emit_x_over_y(ctx, y, tl_buf, NULL, tl_buf2, + tl_table_answer, tl_table_answer_mantissa, fmt, true); + + if (tl_buf3) { + bf16_emit_add_const(ctx, tl_buf2, tl_buf3, fmt, 0); + } +#else + //if (tl_buf3) { + // cvm_emit_add_const(ctx, tl_buf, tl_buf3, fmt, 0); + //} + + // get xy == 0 and y < 
0, add pi + // using xy to depend x = 0 or y = 0 + // recipical y < 0 get 0xFEFF, y > 0 get 0x7F7F, + // 1. b = xy to get other/(x = 0 or y = 0) + // 2. c = b * 2^64 to saturate it + // 3. c(bf16) = c(int8) >> 10 to get 1/0 map, 1 indicate xy > 0 + // 4. c = c * -1 + 1 to invert map, 1 indicate x = 0 or y = 0 + // 5. d = b(int8) - 0x7f, 0 means y > 0 + // 6. d = d(int8) + 0xff to get inf + cvm_emit_mul(ctx, y, tl_buf, tl_buf2, fmt); + // get 7f7f / 0 + cvm_emit_mul_const(ctx, tl_buf2, tl_ofmap_bf16, fmt, convert_bf16_fp32(0x7f00)); + //// 1 = 0x3f80 + //bf16_emit_mul_const(ctx, tl_buf2, tl_ofmap_bf16, fmt, 0); + //bf16_emit_add_const(ctx, tl_ofmap_bf16, tl_buf4, fmt, 1.0); + // bf16->uint8_t and back uint8_t->bf16 to get 0/1 map + +#if 1 + cvk_tl_t index_uint8_t; + bmk1880v2_tensor_lmem_s_copy_bf16_8(ctx, &index_uint8_t, tl_buf2, CVK_FMT_U8); + + index_uint8_t.shape.w = index_uint8_t.shape.w / 2; + index_uint8_t.stride = ctx->ops->tl_default_stride(ctx, index_uint8_t.shape, + CTRL_NULL, CVK_FMT_I8); + + index_uint8_t.fmt = CVK_FMT_I8; + + cvk_tdma_l2l_tensor_copy_param_t p1; + p1.src = tl_ofmap_bf16; + p1.dst = &index_uint8_t; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + cvk_tiu_mul_param_t p; + +#if 0 + + p.res_high = NULL; + p.res_low = &index_uint8_t; + p.a = &index_uint8_t; + p.b_is_const = 1; + p.b_const.val =-1; + p.b_const.is_signed = 1; + p.rshift_bits = 0; + p.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p); +#else + p.res_high = NULL; + p.res_low = &index_uint8_t; + p.a = &index_uint8_t; + p.b_is_const = 1; + p.b_const.val =-1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p); + +#endif + + // get -1/0 map, -1 indicate xy != 0 + p1.src = &index_uint8_t; + p1.dst = tl_ofmap_bf16; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + // x * (-1) + 1 get 0/1 map, 1 indicate xy == 0 + //bf16_emit_mul_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, -1.0); + cvm_emit_add_const(ctx, tl_ofmap_bf16, 
tl_ofmap_bf16, fmt, 1.0); + + // get y < 0, bf16->int8 and mul 0xff to get -128 and righshift to get 1 + cvm_emit_mul_const(ctx, y, tl_buf3, fmt, pow(2,64)); + p1.src = tl_buf3; + p1.dst = &index_uint8_t; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + + p.res_high = 0; + p.res_low = &index_uint8_t; + p.a = &index_uint8_t; + p.b_is_const = 1; + p.b_const.val =-128; + p.b_const.is_signed = 1; + p.rshift_bits = 0; + p.relu_enable = 1; + + ctx->ops->tiu_mul(ctx, &p); + + p.res_high = 0; + p.res_low = &index_uint8_t; + p.a = &index_uint8_t; + p.b_is_const = 1; + p.b_const.val =1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 1; + + ctx->ops->tiu_mul(ctx, &p); + + // get y < 0 + p1.src = &index_uint8_t; + p1.dst = tl_buf4; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + cvm_emit_mul_const(ctx, tl_buf4, tl_buf4, fmt, -1.0); + + // get y > 0 + // y * (-1) + 1 get 0/1 map, 1 indicate xy == 0 + cvm_emit_add_const(ctx, tl_buf4, tl_buf2, fmt, 1.0); + cvm_emit_add(ctx, tl_buf2, tl_buf4, tl_buf2, fmt); + + // merge y > 0 && y < 0 && x == 0 + cvm_emit_mul(ctx, tl_ofmap_bf16, tl_buf2, tl_buf3, fmt); + //bf16_emit_add_const(ctx, tl_ofmap_bf16, tl_ofmap_bf16, fmt, 0); + //bf16_emit_mul_const(ctx, tl_ofmap_bf16, tl_buf3, fmt, M_PI); + +#endif + + + cvm_emit_x_over_y(ctx, y, tl_buf, NULL, tl_buf2, + tl_table_answer, tl_table_answer_mantissa, fmt, true); +#endif +#endif + + // x0 = atan(y0) + __cvm_atan_fast_emit(ctx, + tl_buf2, + tl_buf, + tl_buf4, + tl_y0_buf, + tl_invert_buf, + tl_pos_neg_table, + tl_table_answer, + tl_table_answer_mantissa, + OUT tl_ofmap_bf16, + fmt); + + // abs tl_buf3 + // revert and mul to clean !(x == 0 && (y != 0) case + // add pi/2 + cvm_emit_mul_const(ctx, tl_buf3, tl_buf2, fmt, -1); + cvk_tiu_min_param_t p3; + p3.min = tl_buf2; + p3.a = tl_buf3; + p3.b_is_const = 0; + p3.b = tl_buf2; + + ctx->ops->tiu_min(ctx, &p3); + cvm_emit_add_const(ctx, tl_buf2, tl_buf2, fmt, 1.0); + cvm_emit_mul(ctx, tl_ofmap_bf16, tl_buf2, tl_ofmap_bf16, 
fmt); + + cvm_emit_mul_const(ctx, tl_buf3, tl_buf3, fmt, M_PI_2); + cvm_emit_add(ctx, tl_buf3, tl_ofmap_bf16, tl_ofmap_bf16, fmt); +} +#endif + +static void _cvm_atan2_fast_emit_case_3(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, + cvk_tl_t* tl_buf, cvk_tl_t* tl_y0_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table, + cvk_tl_t* tl_table_answer, + cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, float b) { + // case 3 + // atan( y / x) + return __cvm_atan2_fast_emit_case_3(ctx, y, x, tl_buf, tl_y0_buf, tl_invert_buf, tl_pos_neg_table, + tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16, + NULL, fmt, b); +} + +void cvm_atan2_fast_emit(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_buf4, + cvk_tl_t* tl_y0_buf, cvk_tl_t* tl_slope_buf, cvk_tl_t* tl_invert_buf, + cvk_tl_t* tl_pos_neg_table, cvk_tl_t* tl_table_answer, + cvk_tl_t* tl_table_answer_mantissa, cvk_tl_t* tl_0_idx_table, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + cvm_table_check(y, tl_y0_buf, tl_slope_buf, x); + cvm_table_check(tl_buf, tl_invert_buf, tl_pos_neg_table, tl_buf2); + cvm_table_check(tl_buf3, tl_table_answer, tl_table_answer_mantissa, tl_buf4); + cvm_table_check(tl_buf4, tl_table_answer, tl_0_idx_table, tl_buf4); + + // atan(y/x), x > 0 + // atan(y/x) + PI , x < 0 and y >= 0 + // atan(y/x) - PI , x < 0 and y < 0 + // pi / 2, x = 0 and y > 0 + // -pi / 2, x = 0 and y < 0 + // 0, x = 0 and y = 0 + + // atan(y/x), x > 0 + cvm_emit_max_const(ctx, x, tl_buf, fmt, 0.0); + _cvm_atan2_fast_emit_case_3(ctx, y, tl_buf, tl_buf2, tl_y0_buf, tl_invert_buf, tl_pos_neg_table, + tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16, fmt, 0.0); + + // x > 0 + cvm_emit_mask_gt0(ctx, x, tl_buf, tl_buf2, tl_buf3, tl_pos_neg_table, tl_0_idx_table, tl_buf, + fmt); + + cvm_emit_mul(ctx, tl_ofmap_bf16, tl_buf, tl_ofmap_bf16, fmt); + + // atan(y/x) + PI , x < 0 and y >= 0 + cvm_emit_min_const(ctx, 
x, tl_buf, fmt, 0.0); + _cvm_atan2_fast_emit_case_3(ctx, y, tl_buf, tl_buf2, tl_y0_buf, tl_invert_buf, tl_pos_neg_table, + tl_table_answer, tl_table_answer_mantissa, tl_buf4, fmt, M_PI); + // cvm_emit_add_const(ctx, tl_buf4, tl_buf4, fmt, M_PI); + + // get index map that x < 0 and y >= 0 + // !(y >= 0) = !(y < 0) +#if 0 + cvm_emit_pos_idx(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // y == 0 + cvm_emit_0_idx(ctx, y, tl_buf, tl_0_idx_table, tl_buf2, fmt); + cvm_emit_add(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); +#else + // y >= 0 + cvm_emit_mask_ge0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf2, fmt); +#endif + // x < 0 + cvm_emit_mask_lt0(ctx, x, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // x < 0 && y >= 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + + cvm_emit_mul(ctx, tl_buf4, tl_buf2, tl_buf, fmt); + cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // atan(y/x) - PI , x < 0 and y < 0 + cvm_emit_min_const(ctx, x, tl_buf, fmt, 0.0); + _cvm_atan2_fast_emit_case_3(ctx, y, tl_buf, tl_buf2, tl_y0_buf, tl_invert_buf, tl_pos_neg_table, + tl_table_answer, tl_table_answer_mantissa, tl_buf4, fmt, 0.0); + cvm_emit_add_const(ctx, tl_buf4, tl_buf4, fmt, -1.0 * M_PI); + // x < 0 and y < 0 + + // we leverage x <= 0 and y <= 0 cuz we filter out x = 0 case, speed up it + // x < 0 + cvm_emit_mask_lt0(ctx, x, tl_buf, tl_pos_neg_table, tl_buf2, fmt); + // y < 0 + cvm_emit_mask_lt0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // x < 0 && y < 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + + cvm_emit_mul(ctx, tl_buf4, tl_buf2, tl_buf, fmt); + cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // pi / 2, x = 0 and y > 0 + // x = 0 + cvm_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1 + + // y > 0 + // cvm_emit_mask_gt0(ctx, y, tl_buf, tl_buf5, tl_buf4, tl_pos_neg_table, tl_0_idx_table, tl_buf3, + // fmt); + _cvm_emit_mask(ctx, y, tl_buf, tl_buf4, NULL, tl_pos_neg_table, tl_0_idx_table, 
tl_buf3, fmt, + CVM_MASK_TYPE_GT_0, true); + // x = 0 && y > 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, fmt, M_PI / 2.0); + + cvm_emit_add(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // -pi / 2, x = 0 and y < 0 + // x = 0 + cvm_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1 + // y < 0 + cvm_emit_mask_lt0(ctx, y, tl_buf, tl_pos_neg_table, tl_buf3, fmt); + // x = 0 && y < 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + + cvm_emit_mul_const(ctx, tl_buf2, tl_buf2, fmt, -1.0 * M_PI / 2.0); + + cvm_emit_add(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // 0, x = 0 and y = 0 + // x = 0 + cvm_emit_mask_eq0(ctx, x, tl_buf, tl_0_idx_table, tl_buf2, fmt); // 0.003 could consider 1 + // y = 0 + cvm_emit_mask_eq0(ctx, y, tl_buf, tl_0_idx_table, tl_buf3, fmt); // 0.003 could consider 1 + + // x = 0 && y = 0 + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf3, fmt); + + // !(x = 0 and y = 0) keep it + cvm_emit_0_1_revert_input(ctx, tl_buf3, tl_buf, tl_buf2, fmt); + cvm_emit_mul(ctx, tl_buf2, tl_ofmap_bf16, tl_ofmap_bf16, fmt); +} + +static void _x_lt_0(cvk_context_t* ctx, cvk_tl_t* x, cvk_tl_t* tl_buf, cvk_tl_t* index_i8, + cvk_fmt_t fmt, cvk_tl_t* OUT tl_buf2) { + cvk_tiu_min_param_t p7; + cvk_tiu_mul_param_t p; + cvk_tdma_l2l_tensor_copy_param_t p1; + + // x < 0 + p7.min = tl_buf; + p7.a = x; + p7.b_is_const = 1; + p7.b_const.val = 0; + p7.b_const.is_signed = 1; + + ctx->ops->tiu_min(ctx, &p7); + cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, pow(2, 64)); + + p1.src = tl_buf; + p1.dst = index_i8; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + p.res_high = 0; + p.res_low = index_i8; + p.a = index_i8; + p.b_is_const = 1; + p.b_const.val = -128; + p.b_const.is_signed = 1; + p.rshift_bits = 0; + p.relu_enable = 1; + + ctx->ops->tiu_mul(ctx, &p); + + p.res_high = 0; + p.res_low = index_i8; + p.a = index_i8; + p.b_is_const = 1; + p.b_const.val = 1; + 
p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 1; + + ctx->ops->tiu_mul(ctx, &p); + + // get x < 0 + p1.src = index_i8; + p1.dst = tl_buf2; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); +} + +static void _cvm_atan2_merge_emit(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_y0_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table, + cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt, float degree_factor) { + cvm_table_check(y, tl_y0_buf, tl_invert_buf, x); + cvm_table_check(tl_buf, tl_invert_buf, tl_pos_neg_table, tl_buf2); + cvm_table_check(tl_buf3, tl_table_answer, tl_table_answer_mantissa, tl_ofmap_bf16); + + cvk_tl_t index_i8; + bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, &index_i8, tl_buf2, CVK_FMT_I8); + + /** + * step 1. atan(y/x) + */ + cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, 0.0); + cvm_emit_add(ctx, x, tl_buf, tl_buf, fmt); + +#if 0 + // get y < 0, bf16->int8 and mul 0xff to get -128 and righshift to get 1 + cvk_tiu_mul_param_t p; + cvk_tdma_l2l_tensor_copy_param_t p1; + cvm_emit_mul_const(ctx, y, tl_buf3, fmt, pow(2,64)); + p1.src = tl_buf3; + p1.dst = &index_i8; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + + p.res_high = 0; + p.res_low = &index_i8; + p.a = &index_i8; + p.b_is_const = 1; + p.b_const.val =-128; + p.b_const.is_signed = 1; + p.rshift_bits = 0; + p.relu_enable = 1; + + ctx->ops->tiu_mul(ctx, &p); + + p.res_high = 0; + p.res_low = &index_i8; + p.a = &index_i8; + p.b_is_const = 1; + p.b_const.val =1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 1; + + ctx->ops->tiu_mul(ctx, &p); + + // get y < 0 + p1.src = &index_i8; + p1.dst = tl_buf3; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + cvm_emit_mul_const(ctx, tl_buf3, tl_buf3, fmt, -1.0); + + // get y > 0 + // y * (-1) + 1 get 0/1 map, 1 indicate xy == 0 + cvm_emit_add_const(ctx, tl_buf3, tl_buf2, fmt, 1.0); + 
+ // reduce y == 0 + if (0) + { + cvk_tiu_max_param_t p3; + cvk_tl_t index_i8; + bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, &index_i8, tl_ofmap_bf16, CVK_FMT_I8); + cvm_emit_mul_const(ctx, y, tl_buf, fmt, -1); + p3.max = tl_buf; + p3.a = y; + p3.b_is_const = 0; + p3.b = tl_buf; + + ctx->ops->tiu_max(ctx, &p3); + cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, convert_bf16_fp32(0x7f00)); + //bf16_emit_mul_const(ctx, tl_buf, tl_buf, fmt, pow(2, 64)); + + p1.src = tl_buf; + p1.dst = &index_i8; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + p.res_high = NULL; + p.res_low = &index_i8; + p.a = &index_i8; + p.b_is_const = 1; + p.b_const.val =-1; + p.b_const.is_signed = 1; + p.rshift_bits = 7; + p.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p); + + + p1.src = &index_i8; + p1.dst = tl_buf3; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p1); + + //revert it + cvm_emit_mul_const(ctx, tl_buf3, tl_buf3, fmt, -1.0); + //bf16_emit_add_const(ctx, tl_buf3, tl_buf3, fmt, 1); + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf2, fmt); + } + + cvm_emit_add(ctx, tl_buf2, tl_buf3, tl_buf3, fmt); +#endif + + cvm_emit_x_over_y(ctx, y, tl_buf, NULL, tl_buf2, tl_table_answer, tl_table_answer_mantissa, fmt, + true); + + // x0 = atan(y0) + __cvm_atan_fast_emit(ctx, tl_buf2, tl_buf, tl_buf3, tl_y0_buf, tl_invert_buf, tl_pos_neg_table, + tl_table_answer, tl_table_answer_mantissa, OUT tl_ofmap_bf16, fmt); + + bmk1880v2_tensor_lmem_s_copy_l2l_bf16_8(ctx, &index_i8, tl_buf, CVK_FMT_I8); + + // seperate y >= 0 or < 0 to handle 0 degree / 180 degree + cvm_emit_mask_ge0_lt0(ctx, y, &index_i8, tl_buf3, fmt); + + /** + * step 2. 
set x == 0, y >=0 to pi/2, y < 0 to -pi/2 + * FIXME: atan(0) not eq PI/2 + */ + + // x = 0 and y != 0 + // reset all x = 0 + // y >= 0 as pi/2, y < 0 as -pi/2 + // merge + + cvm_emit_mask_eq_0(ctx, x, tl_buf, &index_i8, tl_buf2, fmt); + + // clear x = 0 + cvm_emit_mul_const(ctx, tl_buf2, tl_buf, fmt, -1); + cvm_emit_mul(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // get revert map, x = -x + 1 cuz original -1 menas x != 0 + cvm_emit_mul_const(ctx, tl_buf3, tl_buf, fmt, M_PI_2 * degree_factor); + cvm_emit_add_const(ctx, tl_buf2, tl_buf2, fmt, 1); + + cvm_emit_mul(ctx, tl_buf, tl_buf2, tl_buf, fmt); + + cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + // return; + /** + * step 3. handle x < 0 && y != 0 + */ + + // x < 0 + _x_lt_0(ctx, x, tl_buf, &index_i8, fmt, tl_buf2); + + // x < 0 && (y >= 1 && y < 1) + cvm_emit_mul(ctx, tl_buf2, tl_buf3, tl_buf, fmt); + cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, M_PI * degree_factor); + cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + /** + * 4. handle x != 0 && y == 0, x>0: 0, x<0: PI, tpu atan default all pi/2 + */ + // tl_buf2 as x < 0 + // get y == 0, tl_buf3 keep y>=0 is 1, y<1 = -1 + cvm_emit_mask_eq_0(ctx, y, tl_buf, &index_i8, tl_buf3, fmt); + // revert + cvm_emit_mul_const(ctx, tl_buf3, tl_buf, fmt, -1.0); + + // reset y = 0 x = ? as 0, other case leave to step 5 + cvm_emit_mul(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + + /** + * 5. set y == 0 and x < 0 as pi + */ + + // get y == 0 + cvm_emit_add_const(ctx, tl_buf3, tl_buf, fmt, 1.0); + // y == 0 && x < 0 + cvm_emit_mul(ctx, tl_buf, tl_buf2, tl_buf, fmt); + cvm_emit_mul_const(ctx, tl_buf, tl_buf, fmt, M_PI * degree_factor); + + // merge + cvm_emit_add(ctx, tl_buf, tl_ofmap_bf16, tl_ofmap_bf16, fmt); + return; +} + +/** + * \brief reduce lut table with following step + * 1. atan(y/x) + * 2. handle x = 0 && y != 0, directly set pi/2, -pi/2 + * 3. 
handle x < 0 && y != 0 + * => y>0: PI/2, y <0: -PI/2, tpu atan default y>0: -PI/2, y <0: PI/2 + * 4. handle x != 0 && y == 0, x>0: 0, x<0: PI, tpu atan default all pi/2 + * 5. handle x = 0 && y = 0 => PI + */ +void cvm_atan2_merge_emit(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_y0_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table, + cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + return _cvm_atan2_merge_emit(ctx, y, x, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, tl_invert_buf, + tl_pos_neg_table, tl_table_answer, tl_table_answer_mantissa, + tl_ofmap_bf16, fmt, 1.0); +} + +void cvm_atan2_fast_degree_emit(cvk_context_t* ctx, cvk_tl_t* y, cvk_tl_t* x, cvk_tl_t* tl_buf, + cvk_tl_t* tl_buf2, cvk_tl_t* tl_buf3, cvk_tl_t* tl_y0_buf, + cvk_tl_t* tl_invert_buf, cvk_tl_t* tl_pos_neg_table, + cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16, cvk_fmt_t fmt) { + return _cvm_atan2_merge_emit(ctx, y, x, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, tl_invert_buf, + tl_pos_neg_table, tl_table_answer, tl_table_answer_mantissa, + tl_ofmap_bf16, fmt, 180 / M_PI); +} diff --git a/cvimath/src/tiu_reciprocal.c b/cvimath/src/tiu_reciprocal.c new file mode 100644 index 000000000..5cf16154d --- /dev/null +++ b/cvimath/src/tiu_reciprocal.c @@ -0,0 +1,149 @@ +/** + */ +#include +#include "gen_lut.h" // NOLINT + +//#define DBG + +/* + * NOTICE: it could occupy 2 lookup table size which shape is <1,32,32,8> with bf16 data type + * + * \tl_buf tmp buffer, the shape MUST be same with \tl_ifmap + * \tl_ofmap_bf16 result as bf16, MUST given for tmp buffer used + */ +int cvm_emit_reciprocal(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* IN tl_buf, + cvk_tl_t* tbl_answer, cvk_tl_t* tbl_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16) { + return cvm_lut_exp_mantissa(ctx, tl_ifmap, tl_buf, tbl_answer, tbl_answer_mantissa, + 
tl_ofmap_bf16); +} + +// 0, exp from 0 -62 -61 .. 62 63 + for (int i = 0; i < half - 1; i++) { + int shift = (exp_start + i); + bool is_odd = (shift % 2); + float exp = shift; + if (is_odd) { + exp = exp - 1; + } + + double s = _gen_reciprocal(2, exp); + table_data[idx] = convert_fp32_bf16(s); +#ifdef DBG + printf("t [%lu] is %f [idx:%f][2^%f] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), + (float)(exp_start + i), -1 * exp, table_data[idx]); +#endif + idx++; + } + + s = _gen_reciprocal(2, -0); + table_data[idx] = convert_fp32_bf16(s); + table_data[idx] = 0x7F80; //c; i++) { + memcpy(&table_data[i * table_hw], &table_data[0], sizeof(uint16_t) * table_hw); + } +} + +void cvm_gen_reciprocal_mantissa(uint16_t* OUT table_mantissa, cvk_tl_shape_t* table_shape) { + ASSERT(is_1880v2_tbl_shape(table_shape)); + + uint32_t half = half_h_table(); + int table_hw = cvm_table_hw(); + + int idx = 0; + double d; + for (uint32_t i = 0; i < half; i++) { + d = 1 + i * 1 / 128.0; + d = (double)pow(d, -1); + table_mantissa[128 + idx] = convert_fp32_bf16(d); + + // 13=2^3x1.625=(2^2)x(2^1x1.625) + d = 2 * (1 + i * 1 / 128.0); + d = (double)pow(d, -1); + table_mantissa[idx] = convert_fp32_bf16(d); + idx++; + } + +#ifdef DBG + for (uint32_t i = 0; i < 2 * half; i++) { + printf("mantissa [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_mantissa[i]), + table_mantissa[i]); + } +#endif /* ifdef DBG */ + + // duplicate channel #1 to #31 + // TODO: tensor copy + for (uint64_t i = 1; i < table_shape->c; i++) { + memcpy(&table_mantissa[table_hw * i], &table_mantissa[0], sizeof(uint16_t) * table_hw); + } +} + +void cvm_reciprocal_tbl(uint16_t* table_data, uint16_t* table_mantissa, + cvk_tl_shape_t* table_shape) { + ASSERT(table_data); + ASSERT(table_mantissa); + ASSERT(table_shape); + + cvm_gen_reciprocal(table_data, table_shape); + cvm_gen_reciprocal_mantissa(table_mantissa, table_shape); +} diff --git a/cvimath/src/tiu_reshape_c.c b/cvimath/src/tiu_reshape_c.c new file mode 100644 index 
//#define DBG
// copy from \1880v2_test_util.h
// Effective size of one spatial dim after inserting `ins_h` zeros between
// elements, `ins_h_l` trailing insertions, and top/bottom padding.
static int calc_dilute_hw(int h, int ins_h, int ins_h_l, int pad_h_b, int pad_h_t) {
  return (h - 1) * (ins_h + 1) + ins_h_l + 1 + pad_h_t + pad_h_b;
}

// get padding as 'SAME' mode in tensorflow
// https://www.jianshu.com/p/05c4f1621c7e
// TensorFlow defines SAME padding as max(0, (ceil(ih/sh)-1)*sh + kh - ih);
// the unclamped expression can go negative (e.g. ih=5, sh=3, kh=1 -> -1),
// and callers add the result to tensor shapes, so clamp at zero.
static int get_same_pad(int ih, int sh, int kh) {
  int pad = (((ih + sh - 1) / sh) - 1) * sh + kh - ih;
  return pad > 0 ? pad : 0;
}

// get real 'h' with pad/ins: input height extended by insertion and padding
static int pooling_ih_ext(int ins_h, int ins_last_h, int pad_top, int pad_bottom, int ih) {
  int ins = ins_h;
  int ins_last = ins_last_h;
  int pad = pad_top + pad_bottom;
  return (ih - 1) * (ins + 1) + ins_last + 1 + pad;
}

// get real 'w' with pad/ins: input width extended by insertion and padding
static int pooling_iw_ext(int ins_w, int ins_last_w, int pad_left, int pad_right, int iw) {
  int ins = ins_w;
  int ins_last = ins_last_w;
  int pad = pad_left + pad_right;
  return (iw - 1) * (ins + 1) + ins_last + 1 + pad;
}

// get output h with parameter: standard conv/pool output-size formula
// oh = (ih_ext - dilated_kh) / stride_h + 1
static int pooling_oh(int ins_h, int ins_last_h, int pad_top, int pad_bottom, int stride_h, int ih,
                      int kh, int dh) {
  int ih_ext = pooling_ih_ext(ins_h, ins_last_h, pad_top, pad_bottom, ih);
  int d_h = (kh - 1) * dh + 1;
  return (ih_ext - d_h) / stride_h + 1;
}

// get output w with parameter: standard conv/pool output-size formula
static int pooling_ow(int ins_w, int ins_last_w, int pad_left, int pad_right, int stride_w, int iw,
                      int kw, int dw) {
  int iw_ext = pooling_iw_ext(ins_w, ins_last_w, pad_left, pad_right, iw);
  int d_w = (kw - 1) * dw + 1;
  return (iw_ext - d_w) / stride_w + 1;
}
old_bias_c > 0 && ci % old_bias_c == 0); + int sz = fmt == CVK_FMT_BF16 ? 4 : 2; + + int d_c_bias_sz = ni * ci * hi * wi; + uint8_t* new_bias = (uint8_t*)malloc(d_c_bias_sz * sz); + int bias_hw = hi * wi; + int duplicat_c = ci / old_bias_c; + + for (int c = 0; c < old_bias_c; c++) { + int shift = (c * bias_hw) * sz; + for (int i = 0; i < duplicat_c; i++) { + int new_bias_shift = (c * duplicat_c + i) * bias_hw * sz; + memcpy(&new_bias[new_bias_shift], &bias[shift], bias_hw * sz); + } + } + return (uint32_t*)new_bias; +} + +/* + * \brief prepare load shape/stride + * \return -1 means fail to reshape, 0 means success + * \TODO check memory usage + */ +static inline int _get_dup_shape(cvk_context_t* ctx, int in, int ic, int ih, int iw, int d_kh, + int stride_h, int npu_num, cvk_tl_shape_t* tl_shape, + cvk_tl_stride_t* tl_load_stride, cvk_tg_shape_t* tg_shape, + cvk_tg_stride_t* tg_stride, cvk_fmt_t src_tg_fmt, + cvk_fmt_t dst_tl_fmt) { + ASSERT(in > 0 && ic > 0 && ih > 0 && iw > 0 && d_kh > 0 && stride_h > 0); + ASSERT(tl_shape && tl_load_stride && tg_shape && tg_stride); + + // 1. 
reshape and extend c, h axis in order + int ch = ic * ih; + int oc; + int oh; + + // FIXME: check kernel setting + oh = 0; + + for (int i = npu_num / ic; i > 0; i--) { +#if 0 + int hw = ih * iw; + int _oh = hw / i / iw; + if (hw % i == 0 && (hw / i) % stride_h == 0 && _oh >= stride_h) { + oh = _oh; + break; + } +#else + int _oh = ih / i; + if (ih % i == 0 && (_oh) % stride_h == 0 && _oh >= stride_h /*&& _oh >= d_kh*/) { + oh = _oh; + break; + } +#endif + } + + if (!oh) { + // FIXME: check terminal condition + return -1; + } + + oc = ch / oh; + +#ifdef DBG + printf("ic:ih is %d %d, oc:oh is %d:%d\n", ic, ih, oc, oh); +#endif + + // tg/tl MUST be same shape size + tl_shape->n = tg_shape->n = 1; + tl_shape->c = tg_shape->c = oc; + tl_shape->h = tg_shape->h = oh; + tl_shape->w = tg_shape->w = iw; + + // init tl + cvk_tl_stride_t s = ctx->ops->tl_default_stride(ctx, *tl_shape, dst_tl_fmt, CTRL_NULL); + tl_load_stride->n = s.n; + tl_load_stride->c = s.c; + tl_load_stride->h = s.h; + tl_load_stride->w = s.w; + + // init tg + cvk_tg_stride_t gs = ctx->ops->tg_default_stride(ctx, *tg_shape, src_tg_fmt); + + tg_stride->n = gs.n; + tg_stride->c = gs.c; + tg_stride->h = gs.h; + + return 0; +} + +/** + * \brief get proper reshape size for depthwise conv with 'same' mode in h direction + * \return -1 means alloc fail + * \NOTICE: not support batch/ins_x/dilated_x/pad_top/pad_bottom + */ +int cvm_reshape_channel_same(cvk_context_t* ctx, int ic, int ih, int iw, int kh, int kw, + int pad_right, int pad_left, int stride_h, int stride_w, + cvk_tl_shape_t* tl_load_shape, cvk_tl_stride_t* new_tl_ifmap_stride, + cvk_tg_shape_t* new_tg_ifmap_shape, + cvk_tg_stride_t* new_tg_ifmap_stride, + cvk_tl_shape_t* new_tl_weight_shape, cvk_tl_shape_t* new_tl_bias_shape, + cvk_tl_shape_t* new_tl_ofmap_shape, cvk_fmt_t fmt, int eu_align) { + ASSERT(eu_align == 0 || eu_align == 1); + + cvk_chip_info_t info = ctx->info; + // TODO: verify dilation_h/dilation_w + int dilation_h = 1; + int dilation_w = 
1; + // TODO: verify p->ins_h, p->ins_last_h + int d_kh = calc_dilute_hw(kh, dilation_h - 1, 0, 0, 0); + int h_after = calc_dilute_hw(ih, 0, 0, 0, 0); + int in = 1; + // int h_after = calc_dilute_hw(ih, p->ins_h, p->ins_last_h, p->pad_top, p->pad_bottom); + // int w_after = calc_dilute_hw(iw, p->ins_w, p->ins_last_w, p->pad_left, p->pad_right); + int ret = _get_dup_shape(ctx, in, ic, h_after, iw, d_kh, stride_h, info.npu_num, tl_load_shape, + new_tl_ifmap_stride, new_tg_ifmap_shape, new_tg_ifmap_stride, fmt, fmt); + + if (ret == -1) { + return ret; + } + + new_tl_weight_shape->n = 1; + new_tl_weight_shape->c = tl_load_shape->c; + new_tl_weight_shape->h = kh; + new_tl_weight_shape->w = kw; + + new_tl_bias_shape->n = 2; + new_tl_bias_shape->c = tl_load_shape->c; + new_tl_bias_shape->h = 1; + new_tl_bias_shape->w = 1; + + int pad_h = get_same_pad(tl_load_shape->h, stride_h, kh); + // int no_pad_h = tl_load_shape->h; + + // reserve for padding + new_tg_ifmap_shape->h += pad_h; + tl_load_shape->h += pad_h; + + cvk_tl_stride_t s = ctx->ops->tl_default_stride(ctx, *tl_load_shape, fmt, eu_align); + + new_tl_ifmap_stride->n = s.n; + new_tl_ifmap_stride->c = s.c; + new_tl_ifmap_stride->h = s.h; + new_tl_ifmap_stride->w = s.w; + + // TODO: verity ins_x + int oh = pooling_oh(0, 0, 0, 0, stride_h, tl_load_shape->h, kh, dilation_h); + int ow = pooling_ow(0, 0, pad_left, pad_right, stride_w, tl_load_shape->w, kw, dilation_w); + +#ifdef DBG + printf("new oh/ow pad_h is %d/%d %d\n", oh, ow, pad_h); +#endif + new_tl_ofmap_shape->n = in; + new_tl_ofmap_shape->c = tl_load_shape->c; + new_tl_ofmap_shape->h = oh; + new_tl_ofmap_shape->w = ow; + + return ret; +} + +/* + * \brief duplicate weight for reshaped c + */ +uint8_t* cvm_reshape_channel_weight(uint8_t* weight, int ni, int ci, int hi, int wi, + int old_weight_c, cvk_fmt_t fmt) { + ASSERT(weight); + ASSERT(ci / old_weight_c > 0 && ci % old_weight_c == 0); + + int sz = fmt == CVK_FMT_BF16 ? 
2 : 1; + + int new_weight_hw_shape_size = hi * wi; + int new_weight_shape_size = ni * ci * hi * wi; + int duplicat_c = ci / old_weight_c; + uint8_t* new_weight = (uint8_t*)malloc(new_weight_shape_size * sz); + + for (int n = 0; n < ni; n++) { + for (int c = 0; c < old_weight_c; c++) { + int index = (n * old_weight_c + c) * new_weight_hw_shape_size * sz; + for (int i = 0; i < duplicat_c; i++) { + int new_weight_index = + (n * old_weight_c * duplicat_c + c * duplicat_c + i) * new_weight_hw_shape_size * sz; + memcpy(&new_weight[new_weight_index], &weight[index], new_weight_hw_shape_size * sz); + } + } + } + + return new_weight; +} + +/* + * \brief prepare load shape/stride with pad + * \return -1 means fail to reshape, 0 means success + * \TODO check memory usage + */ +static inline int _get_dup_shape_same_pad(cvk_context_t* ctx, int in, int ic, int ih, int iw, + int d_kh, int stride_h, int npu_num, + cvk_tl_shape_t* tl_load_shape, + cvk_tl_stride_t* tl_load_stride, cvk_tg_shape_t* tg_shape, + cvk_tg_stride_t* tg_stride, cvk_fmt_t src_tg_fmt, + cvk_fmt_t dst_tl_fmt) { + ASSERT(in > 0 && ic > 0 && ih > 0 && iw > 0 && d_kh > 0 && stride_h > 0); + ASSERT(tl_load_shape && tl_load_stride && tg_shape && tg_stride); + + // 1. reshape and extend c, h axis in order + int oc; + int oh; + + // FIXME: check kernel setting + oh = 0; + + // 2. get total output + // 3. 
slice output + ASSERT((ih - d_kh) % stride_h == 0); + int ih_ext = pooling_ih_ext(0, 0, 0, 0, ih); + int _oh = (ih_ext - d_kh) / stride_h + 1; + + for (int i = npu_num / ic; i > 0; i--) { + if (_oh % i == 0) { + // add 1 for later padding + oh = stride_h * (_oh / i - 1) + 1; + oc = i * ic; + break; + } + } + + if (!oh) { + // FIXME: check terminal condition + return -1; + } + +#ifdef DBG + printf("ic:ih is %d %d, oc:oh is %d:%d\n", ic, ih, oc, oh); +#endif + + // tg/tl MUST be same shape size + tl_load_shape->n = tg_shape->n = 1; + tl_load_shape->c = tg_shape->c = oc; + tl_load_shape->h = tg_shape->h = oh; + tl_load_shape->w = tg_shape->w = iw; + + // init tl + cvk_tl_stride_t s = ctx->ops->tl_default_stride(ctx, *tl_load_shape, dst_tl_fmt, CTRL_NULL); + tl_load_stride->n = s.n; + tl_load_stride->c = s.c; + tl_load_stride->h = s.h; + tl_load_stride->w = s.w; + + // init tg + cvk_tg_stride_t gs = ctx->ops->tg_default_stride(ctx, *tg_shape, src_tg_fmt); + + tg_stride->n = gs.n; + tg_stride->c = gs.c; + tg_stride->h = gs.h; + + return 0; +} + +/** + * \brief get proper reshape size for depthwise conv with 'same' mode in h direction + * 'pad' means \ih is padded + * \return -1 means alloc fail + * \NOTICE: not support batch/ins_x/dilated_x/pad_top/pad_bottom + */ +int cvm_reshape_channel_same_pad( + cvk_context_t* ctx, int ic, int ih, int iw, int kh, int kw, int pad_right, int pad_left, + int stride_h, int stride_w, cvk_tl_shape_t* tl_load_shape, cvk_tl_stride_t* new_tl_ifmap_stride, + cvk_tg_shape_t* new_tg_ifmap_shape, cvk_tg_stride_t* new_tg_ifmap_stride, + cvk_tl_shape_t* new_tl_weight_shape, cvk_tl_shape_t* new_tl_bias_shape, + cvk_tl_shape_t* new_tl_ofmap_shape, cvk_fmt_t fmt, int eu_align) { + ASSERT(eu_align == 0 || eu_align == 1); + + cvk_chip_info_t info = ctx->info; + // TODO: verify dilation_h/dilation_w + int dilation_h = 1; + int dilation_w = 1; + // TODO: verify p->ins_h, p->ins_last_h + int d_kh = calc_dilute_hw(kh, dilation_h - 1, 0, 0, 0); + int 
h_after = calc_dilute_hw(ih, 0, 0, 0, 0); + int in = 1; + // int h_after = calc_dilute_hw(ih, p->ins_h, p->ins_last_h, p->pad_top, p->pad_bottom); + // int w_after = calc_dilute_hw(iw, p->ins_w, p->ins_last_w, p->pad_left, p->pad_right); + int ret = _get_dup_shape_same_pad(ctx, in, ic, h_after, iw, d_kh, stride_h, info.npu_num, + tl_load_shape, new_tl_ifmap_stride, new_tg_ifmap_shape, + new_tg_ifmap_stride, fmt, fmt); + + if (ret == -1) { + return ret; + } + + new_tl_weight_shape->n = 1; + new_tl_weight_shape->c = tl_load_shape->c; + new_tl_weight_shape->h = kh; + new_tl_weight_shape->w = kw; + + new_tl_bias_shape->n = 2; + new_tl_bias_shape->c = tl_load_shape->c; + new_tl_bias_shape->h = 1; + new_tl_bias_shape->w = 1; + + int pad_h = get_same_pad(tl_load_shape->h, stride_h, kh); + // int no_pad_h = tl_load_shape->h; + + // reserve for padding + new_tg_ifmap_shape->h += pad_h; + tl_load_shape->h += pad_h; + + cvk_tl_stride_t s = ctx->ops->tl_default_stride(ctx, *tl_load_shape, fmt, eu_align); + + new_tl_ifmap_stride->n = s.n; + new_tl_ifmap_stride->c = s.c; + new_tl_ifmap_stride->h = s.h; + new_tl_ifmap_stride->w = s.w; + + // TODO: verity ins_x + int oh = pooling_oh(0, 0, 0, 0, stride_h, tl_load_shape->h, kh, dilation_h); + int ow = pooling_ow(0, 0, pad_left, pad_right, stride_w, tl_load_shape->w, kw, dilation_w); + +#ifdef DBG + printf("new oh/ow pad_h is %d/%d %d\n", oh, ow, pad_h); +#endif + new_tl_ofmap_shape->n = in; + new_tl_ofmap_shape->c = tl_load_shape->c; + new_tl_ofmap_shape->h = oh; + new_tl_ofmap_shape->w = ow; + + return ret; +} diff --git a/cvimath/src/tiu_sigmoid.c b/cvimath/src/tiu_sigmoid.c new file mode 100644 index 000000000..1bd61e99d --- /dev/null +++ b/cvimath/src/tiu_sigmoid.c @@ -0,0 +1,266 @@ +/** + * implement Linear interpolation search + * + * we need to pass 2 table, one is answer(lut_answer), another is slope with + * anwser(lut_answer_slope), + * + * for example, we want to get x value + * +------+----+ + * x0 x x1 + * + * the 
[Linear interpolation defined] (https://en.wikipedia.org/wiki/Linear_interpolation) as + * flowing: + * + * part C part A part B + * +--+ +---+ +----------------------------------------+ + * + * p(x) = f(x0) + ( (f(x1) - f(x0)) / (x1 - x0) ) * (x - x0) + * + * +---+ +-----------------------------+ + * lut_answer lut_answer_slope + */ + +#include +#include "gen_lut.h" // NOLINT + +//#define DBG +/* + * NOTICE: it could occupy 2 lookup table size which shape is <1,32,32,8> with bf16 data type + * + * \tl_buf tmp buffer, the shape MUST be same with \tl_ifmap + * \tl_ofmap_bf16 result as bf16, MUST given for tmp buffer used + */ +int cvm_emit_sigmoid(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* IN tl_buf, + cvk_tl_t* tl_table_answer, cvk_tl_t* tl_table_answer_slope, + cvk_tl_t* OUT tl_ofmap_bf16, float scale) { + cvm_table_check(tl_ifmap, tl_table_answer, tl_table_answer_slope, tl_buf); + + cvk_fmt_t fmt = CVK_FMT_BF16; + + cvk_tl_shape_t tl_ofmap_A_idx_int8_shape = {1, tl_buf->shape.c, tl_buf->shape.h * tl_buf->shape.w, + 1}; + + cvk_tdma_l2l_tensor_copy_param_t p10; + + // scale input for remap its idx(-x~x) to (-127~127), dirty tl_ifmap + cvk_tiu_mul_param_t p1; + p1.res_high = NULL; + p1.res_low = tl_ifmap; + p1.a = tl_ifmap; + p1.b_is_const = 1; + p1.b_const.val = convert_fp32_bf16(scale); + p1.rshift_bits = 0; + p1.relu_enable = 0; + + ctx->ops->tiu_mul(ctx, &p1); + + // int8 + // save by stride + memset(&p10, 0x00, sizeof(cvk_tdma_l2l_tensor_copy_param_t)); + cvk_tl_t dst; + memcpy(&dst, tl_ofmap_bf16, sizeof(cvk_tl_t)); + dst.fmt = CVK_FMT_I8; + dst.shape = tl_ofmap_A_idx_int8_shape; + // dst.stride = ctx->ops->tl_default_stride(ctx, dst.shape, /*eu_align*/ 1, + // dst.fmt); + dst.stride = ctx->ops->tl_default_stride(ctx, dst.shape, dst.fmt, CTRL_NULL); + dst.stride.h = dst.stride.h * 2; + dst.int8_rnd_mode = 1; + p10.dst = &dst; + p10.src = tl_ifmap; + ctx->ops->tdma_l2l_bf16_tensor_copy(ctx, &p10); + dst.int8_rnd_mode = 0; // reset + + // 
/**
 * \brief LUT slots per unit of input: maps the table_hw-entry table onto the
 * input range [range_start, range_end], e.g. 256 / 16 = 16.
 */
float cvm_sigmoid_scale(int range_start, int range_end) {
  // guard the division below; an empty range has no meaningful scale
  ASSERT(range_start != range_end);
  int table_hw = cvm_table_hw();
  return table_hw / (1.0 * abs(range_start - range_end));  // 256 / 16 = 16
}
convert_bf16_fp32(table_slope[i]), s, (float)s, table_slope[i], x1, x0, x1 - x0); +#endif + } + + // duplicate channel #1 to #31 + + // TODO: tensor copy + for (uint64_t i = 1; i < table_shape->c; i++) { + memcpy(&table_slope[table_hw * i], &table_slope[0], sizeof(uint16_t) * table_hw); + } +} + +void cvm_sigmoid_tbl(uint16_t* sigmoid_table_data, uint16_t* sigmoid_table_data_slope, + cvk_tl_shape_t* table_shape, int range_start, int range_end) { + ASSERT(sigmoid_table_data); + ASSERT(sigmoid_table_data_slope); + ASSERT(table_shape); + + double* sigmode_hw = cvm_gen_sigmoid_double(); + + float scale = cvm_sigmoid_scale(range_start, range_end); + + cvm_gen_sigmoid(sigmoid_table_data, table_shape, sigmode_hw, scale, range_start); + + cvm_gen_sigmoid_slope(sigmoid_table_data_slope, table_shape, sigmode_hw, scale, range_start, + range_end); + + cvm_free_sigmoid_double(sigmode_hw); +} diff --git a/cvimath/src/tiu_sqrt.c b/cvimath/src/tiu_sqrt.c new file mode 100644 index 000000000..1977a49ab --- /dev/null +++ b/cvimath/src/tiu_sqrt.c @@ -0,0 +1,121 @@ +/** + */ +#include +#include "gen_lut.h" // NOLINT + +//#define DBG +/* + * NOTICE: it could occupy 2 lookup table size which shape is <1,32,32,8> with bf16 data type + * + * \tl_buf tmp buffer, the shape MUST be same with \tl_ifmap + * \tl_ofmap_bf16 result as bf16, MUST given for tmp buffer used + */ +int cvm_emit_sqrt(cvk_context_t* ctx, cvk_tl_t* IN tl_ifmap, cvk_tl_t* IN tl_buf, + cvk_tl_t* tbl_answer, cvk_tl_t* tbl_answer_mantissa, + cvk_tl_t* OUT tl_ofmap_bf16) { + return cvm_lut_exp_mantissa(ctx, tl_ifmap, tl_buf, tbl_answer, tbl_answer_mantissa, + tl_ofmap_bf16); +} + +static double _gen_sqrt(int base, int p) { + // y = x ^ 0.5 + double f = (double)(pow(base, p * 0.5)); + + if (isnan(f)) { + ASSERT(0); + } + return f; +} + +void cvm_gen_sqrt(uint16_t* table_data, cvk_tl_shape_t* table_shape) { + ASSERT(is_1880v2_tbl_shape(table_shape)); + + int exp_start = cvm_exp_start(); + int half = half_h_table(); + int 
table_hw = cvm_table_hw(); + uint64_t idx = 0; + + // prepare channel 0 + double s = 0.0; + table_data[idx] = convert_fp32_bf16(s); // 0^0.5 = 0 +#ifdef DBG + printf("t [%lu] is %f(%.8lf)[idx:%f][2^%f] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), s, + (float)exp_start, (float)(exp_start / 2), table_data[idx]); +#endif + idx++; + + // > 0, exp from 0 -62 -61 .. 62 63 + for (int i = 0; i < half; i++) { + int shift = (exp_start + i); + bool is_odd = (shift % 2); + float exp = shift; + if (is_odd) { + exp = exp - 1; + } + + double s = _gen_sqrt(2, exp); + table_data[idx] = convert_fp32_bf16(s); +#ifdef DBG + printf("t [%lu] is %f [idx:%f][2^%f(%f)] bf %x\n", idx, convert_bf16_fp32(table_data[idx]), + (float)(exp_start + i), exp / 2, (exp_start + i) / 2.0, table_data[idx]); +#endif + idx++; + } + + //// idx = 127 dont care + // duplicate channel #1 to #channel + // TODO: tensor copy + + for (uint32_t i = 1; i < table_shape->c; i++) { + memcpy(&table_data[i * table_hw], &table_data[0], sizeof(uint16_t) * table_hw); + } +} + +void cvm_gen_sqrt_mantissa(uint16_t* OUT table_mantissa, cvk_tl_shape_t* table_shape) { + ASSERT(is_1880v2_tbl_shape(table_shape)); + + uint32_t half = half_h_table(); + int table_hw = cvm_table_hw(); + + int idx = 0; + double d; + for (uint32_t i = 0; i < half; i++) { + d = 1 + i * 1 / 128.0; + d = (double)pow(d, 0.5); + table_mantissa[128 + idx] = convert_fp32_bf16(d); +#ifdef DBG + // printf(", [%u] is %lf\n", i+128, d); +#endif /* ifdef DBG */ + + // 13=2^3x1.625=(2^2)x(2^1x1.625) + d = 2 * (1 + i * 1 / 128.0); + d = (double)pow(d, 0.5); + table_mantissa[idx] = convert_fp32_bf16(d); +#ifdef DBG + // printf("mantissa [%u] is %lf", i, d); +#endif /* ifdef DBG */ + idx++; + } +#ifdef DBG + for (uint32_t i = 0; i < 2 * half; i++) { + printf("mantissa [%u] is %lf, 0x%x\n", i, convert_bf16_fp32(table_mantissa[i]), + table_mantissa[i]); + } +#endif /* ifdef DBG */ + + // duplicate channel #1 to #31 + // TODO: tensor copy + for (uint64_t i = 1; i < 
table_shape->c; i++) { + memcpy(&table_mantissa[table_hw * i], &table_mantissa[0], sizeof(uint16_t) * table_hw); + } +} + +void cvm_sqrt_tbl(uint16_t* sqrt_table_data, uint16_t* sqrt_table_data_mantissa, + cvk_tl_shape_t* table_shape) { + ASSERT(sqrt_table_data); + ASSERT(sqrt_table_data_mantissa); + ASSERT(table_shape); + + cvm_gen_sqrt(sqrt_table_data, table_shape); + cvm_gen_sqrt_mantissa(sqrt_table_data_mantissa, table_shape); +} diff --git a/cvimath/src/tiu_upsample.c b/cvimath/src/tiu_upsample.c new file mode 100644 index 000000000..4d924f02b --- /dev/null +++ b/cvimath/src/tiu_upsample.c @@ -0,0 +1,54 @@ +#include +#include "gen_lut.h" + +int cvm_upsample2d(cvk_context_t* ctx, cvk_tl_t* tl_input, cvk_tl_t* tl_weight, + cvk_tl_t* tl_output) { + int ih = tl_input->shape.h; + int iw = tl_input->shape.w; + int sh = tl_weight->shape.h; + int sw = tl_weight->shape.w; + int kh = sh; + int kw = sw; + + int pt = 0; + int pl = 0; + int pr = 0; + int pb = 0; + int dh = 1; + int dw = 1; + + int ow = tl_output->shape.w; + int oh = tl_output->shape.h; + int kh_ext = (kh - 1) * dh + 1; + int kw_ext = (kw - 1) * dw + 1; + int ins_h = sh - 1; + int ins_w = sw - 1; + int pad_t = kh_ext - pt - 1; + int pad_l = kw_ext - pl - 1; + int pad_b = oh + pb - (ih - 1) * sh - 1; + int pad_r = ow + pr - (iw - 1) * sw - 1; + + cvk_tiu_depthwise_pt_convolution_param_t param = {0}; + param.ofmap = tl_output; + param.ifmap = tl_input; + param.weight = tl_weight; + param.bias = 0; + param.ins_h = ins_h; + param.ins_last_h = 0; + param.ins_w = ins_w; + param.ins_last_w = 0; + param.stride_h = 1; + param.stride_w = 1; + param.dilation_h = 1; + param.dilation_w = 1; + param.pad_top = pad_t; + param.pad_bottom = pad_b; + param.pad_left = pad_l; + param.pad_right = pad_r; + param.relu_enable = 0; + param.ins_val = 0; // symmetric quantization + param.ins_fp = 0; // symmetric quantization + ctx->ops->tiu_pt_depthwise_convolution(ctx, ¶m); + + return 0; +} diff --git a/cvimath/src/util.c 
b/cvimath/src/util.c new file mode 100644 index 000000000..d19a9cb43 --- /dev/null +++ b/cvimath/src/util.c @@ -0,0 +1,270 @@ +#include +#include +#include +#include +#include + +#include "test_cvikernel_util.h" + +#define container_of(ptr, type, member) \ + ({ \ + const typeof(((type *)0)->member) *__mptr = (ptr); \ + (type *)((char *)__mptr - offsetof(type, member)); \ + }) + +typedef struct { + cvk_tg_t tg; + CVI_RT_MEM mem; +} test_tg_wrapper_t; + +typedef struct { + cvk_mg_t mg; + CVI_RT_MEM mem; +} test_mg_wrapper_t; + +void test_submit_comp(CVI_RT_HANDLE *bm_ctx, cvk_context_t *cvk_ctx) { + (void)cvk_ctx; + (void)bm_ctx; + CVI_RT_Submit(cvk_ctx); +} + +cvk_tg_t *test_alloc_tg_mem_comp(CVI_RT_HANDLE *bm_ctx, cvk_context_t *cvk_ctx, + cvk_tg_shape_t shape, cvk_fmt_t fmt) { + CVI_RT_HANDLE ctx = (CVI_RT_HANDLE)*bm_ctx; + int alloc_sz = tg_shape_size(&shape) * bytesize_of_fmt(fmt); + + test_tg_wrapper_t *w = (test_tg_wrapper_t *)malloc(sizeof(*w)); + assert(w && "Expected allocated tg wrapper"); + + w->tg.base_reg_index = 0; + w->mem = CVI_RT_MemAlloc(ctx, alloc_sz); + w->tg.start_address = CVI_RT_MemGetPAddr(w->mem); + w->tg.fmt = fmt; + w->tg.shape = shape; + w->tg.stride = cvk_ctx->ops->tg_default_stride(cvk_ctx, shape, fmt); + + return &w->tg; +} + +cvk_mg_t *test_alloc_mg_mem_comp(CVI_RT_HANDLE *bm_ctx, cvk_mg_shape_t s, cvk_fmt_t fmt) { + int alloc_sz = mg_shape_size(&s) * bytesize_of_fmt(fmt); + CVI_RT_HANDLE ctx = (CVI_RT_HANDLE)*bm_ctx; + + test_mg_wrapper_t *w = (test_mg_wrapper_t *)malloc(sizeof(*w)); + w->mem = CVI_RT_MemAlloc(ctx, alloc_sz); + + w->mg.base_reg_index = 0; + w->mg.start_address = CVI_RT_MemGetPAddr(w->mem); + w->mg.shape = s; + w->mg.fmt = fmt; + w->mg.stride.row = s.col * bytesize_of_fmt(fmt); + + return &w->mg; +} + +void test_free_tg_mem_comp(CVI_RT_HANDLE *ctx, const cvk_tg_t *tg) { + test_tg_wrapper_t *w = container_of(tg, test_tg_wrapper_t, tg); + CVI_RT_MemFree(*ctx, w->mem); + + free(w); +} + +void 
/**
 * \brief copy host data into a device mg buffer (system-to-device).
 */
void test_put_mg_mem_comp(CVI_RT_HANDLE *ctx, const cvk_mg_t *mg, uint8_t data[]) {
  // Use container_of like every other accessor in this file; the original
  // `(typeof(w))mg` cast only worked because `mg` happens to be the first
  // member of test_mg_wrapper_t, and silently breaks if the struct is
  // reordered.
  test_mg_wrapper_t *w = container_of(mg, test_mg_wrapper_t, mg);
  CVI_RT_MemCopyS2D(*ctx, w->mem, data);
}
2 : 1); + uint8_t *data = (uint8_t *)malloc(size); + assert(data && "Expect allocated data for get mg mem"); + + test_mg_wrapper_t *w = container_of(mg, test_mg_wrapper_t, mg); + CVI_RT_MemCopyD2S(*ctx, data, w->mem); + + return data; +} + +uint8_t *test_get_tensor_l2g_comp(CVI_RT_HANDLE *bm_ctx, cvk_context_t *cvk_ctx, + const cvk_tl_t *tl) { + cvk_tg_shape_t s; + s.n = tl->shape.n; + s.c = tl->shape.h; + s.h = tl->shape.w; + s.w = tl->shape.c; + cvk_tg_t *tg = test_alloc_tg_mem_comp(bm_ctx, cvk_ctx, s, tl->fmt); + + cvk_tdma_l2g_tensor_copy_param_t p; + p.src = tl; + p.dst = tg; + + if (tl->fmt == CVK_FMT_BF16) { + cvk_ctx->ops->tdma_l2g_bf16_tensor_copy(cvk_ctx, &p); + } else { + cvk_ctx->ops->tdma_l2g_tensor_copy(cvk_ctx, &p); + } + test_submit_comp(bm_ctx, cvk_ctx); + uint8_t *data = test_get_tg_mem_comp(bm_ctx, tg); + + test_free_tg_mem_comp(bm_ctx, tg); + return data; +} + +uint8_t *test_get_matrix_l2g_comp(CVI_RT_HANDLE *ctx, cvk_context_t *cvk_ctx, const cvk_ml_t *ml) { + cvk_mg_shape_t s; + s.row = ml->shape.n; + s.col = ml->shape.col; + cvk_mg_t *mg = test_alloc_mg_mem_comp(ctx, s, ml->fmt); + + cvk_tdma_l2g_matrix_copy_param_t p; + p.src = ml; + p.dst = mg; + + if (ml->fmt == CVK_FMT_BF16) { + cvk_ctx->ops->tdma_l2g_bf16_matrix_copy(cvk_ctx, &p); + } else { + cvk_ctx->ops->tdma_l2g_matrix_copy(cvk_ctx, &p); + } + + test_submit_comp(ctx, cvk_ctx); + + uint8_t *data = test_get_mg_mem_comp(ctx, mg); + + test_free_mg_mem_comp(ctx, mg); + + return data; +} + +void test_put_tensor_g2l_comp(CVI_RT_HANDLE *bm_ctx, cvk_context_t *cvk_ctx, const cvk_tl_t *tl, + uint8_t data[]) { + cvk_tg_shape_t tg_shape; + tg_shape.n = tl->shape.n; + tg_shape.c = tl->shape.c; + tg_shape.h = tl->shape.h; + tg_shape.w = tl->shape.w; + + cvk_tg_t *tg = test_alloc_tg_mem_comp(bm_ctx, cvk_ctx, tg_shape, tl->fmt); + + cvk_tdma_g2l_tensor_copy_param_t p; + p.src = tg; + p.dst = tl; + + test_put_tg_mem_comp(bm_ctx, tg, data); + + if (tl->fmt == CVK_FMT_BF16) { + 
#define CNV_SCALAR_C_ALIGN (0x1000)
/**
 * \brief round \p length up to the next multiple of \p align.
 * \param length  value to align (bytes)
 * \param align   alignment, must be non-zero
 * \return smallest multiple of align that is >= length
 */
// `static` added: a plain `inline` definition at file scope provides no
// external definition in C99+, which can fail to link depending on the
// compiler mode; `static inline` is the portable internal-helper form.
static inline uint64_t cnvAlign64(const uint64_t length, const uint64_t align) {
  uint64_t stride = (uint64_t)(length / align) * align;
  if (stride < length) {
    stride += align;
  }
  return stride;
}
uint64_t new_paddr = cnvAlign64(pAddrInfo->phy_addr, CNV_SCALAR_C_ALIGN); + uint64_t offset = new_paddr - pAddrInfo->phy_addr; + pAddrInfo->phy_addr = new_paddr; + pAddrInfo->vir_addr += offset; + + return pAddrInfo->vir_addr; +} + +void test_free_vp_addr(bmctx_t *ctx, AddrInfo *pAddrInfo){ + + bmmem_device_free(*ctx, pAddrInfo->mem); + pAddrInfo->phy_addr = -1; + pAddrInfo->vir_addr = NULL; + //pAddrInfo->size_bytes = 0; + +} diff --git a/cvimath/tests/CMakeLists.txt b/cvimath/tests/CMakeLists.txt new file mode 100644 index 000000000..feacbdcd0 --- /dev/null +++ b/cvimath/tests/CMakeLists.txt @@ -0,0 +1,34 @@ +project(cvimath) + +include(CTest) + +include_directories("${CMAKE_CURRENT_SOURCE_DIR}/include") + +file(GLOB _TEST_UTILS "${CMAKE_CURRENT_SOURCE_DIR}/common/*c") + +# cvi1835 test +include_directories( + ${CMAKE_SOURCE_DIR}/include + ${CMAKE_SOURCE_DIR}/src + ) +file(GLOB CVI1835_TESTS cvi1835/*.cpp) + +# FIXME: repair test case +list(FILTER CVI1835_TESTS EXCLUDE REGEX ".*atan2.*") +list(FILTER CVI1835_TESTS EXCLUDE REGEX ".*depthwise_reshape_same.*") + +foreach(TEST_SRC ${CVI1835_TESTS}) + get_filename_component(TEST_NAME ${TEST_SRC} NAME_WE) + + add_executable(${TEST_NAME} ${_TEST_UTILS} ${TEST_SRC}) + target_link_libraries(${TEST_NAME} ${TPU_KERNEL_LIB} ${TEST_LIBS}) + set_target_properties(${TEST_NAME} PROPERTIES COMPILE_FLAGS "-Werror -Wall -Wextra") + install(TARGETS ${TEST_NAME} DESTINATION bin) + + add_test(${TEST_NAME} ${TEST_NAME} ctest_test) +endforeach() + +#add_library(${PROJECT_NAME} SHARED ${SRC}) +#target_link_libraries(${PROJECT_NAME} ${TPU_KERNEL_LIB}) +#install(TARGETS ${PROJECT_NAME} DESTINATION tests) + diff --git a/cvimath/tests/common/test_native_ref.c b/cvimath/tests/common/test_native_ref.c new file mode 100644 index 000000000..f06db908c --- /dev/null +++ b/cvimath/tests/common/test_native_ref.c @@ -0,0 +1,980 @@ +#include +#include +#include +#include +#include +#include +#include + +#include + +#define math_min(x, y) ((x) < (y) ? 
(x) : (y)) +#define math_max(x, y) ((x) > (y) ? (x) : (y)) + +typedef uint8_t uint8_t; +typedef uint16_t uint16_t; +typedef uint32_t uint32_t; +typedef uint64_t uint64_t; + +typedef int8_t int8_t; +typedef int16_t int16_t; +typedef int32_t int32_t; +typedef int64_t s64; +typedef uint32_t bmerr_t; + +#define BM_SUCCESS 0 // The operation was successful +#define BM_ERR_AGAIN 1 // Not ready yet +#define BM_ERR_FAILURE 2 // General failure +#define BM_ERR_TIMEOUT 3 // Timeout +#define BM_ERR_UNINITIALIZED 4 // Uninitialzed +#define BM_ERR_INVALID_ARGUMENT 5 // Arguments invalid +#define BM_ERR_NOMEM 6 // Not enough memory +#define BM_ERR_DATA 7 // Data error +#define BM_ERR_BUSY 8 // Busy +#define BM_ERR_NOT_SUPPORTED 9 // Not supported yet + +typedef uint32_t BLOB_OP; +#define BLOB_ADD 0 +#define BLOB_SUB 1 +#define BLOB_MUL 2 +#define BLOB_DIV 3 +#define BLOB_INVALID 4 + +static inline int calc_offset(int *shape, int *offset) { + return ((offset[0] * shape[1] + offset[1]) * shape[2] + offset[2]) * shape[3] + offset[3]; +} + +static int index_get(int h, int w1, int w2) { return h * w1 + w2; } + +int array_cmp_float_rel(const char *const info, float *p_exp, float *p_got, int count, + float delta) { + int idx = 0; + for (idx = 0; idx < count; idx++) { + if (math_max(fabs(p_exp[idx]), fabs(p_got[idx])) > 1.0) { + // compare rel + if (math_min(fabs(p_exp[idx]), fabs(p_got[idx])) < 1e-20) { + printf("%s rel error at index %d exp %.20f got %.20f\n", info, idx, p_exp[idx], p_got[idx]); + if (isnan(p_exp[idx]) && isnan(p_got[idx])) { + printf("both exp and got are NAN"); + return 0; + } + return -1; + } + if (fabs(p_exp[idx] - p_got[idx]) > delta * math_min(fabs(p_exp[idx]), fabs(p_got[idx]))) { + printf("%s rel error at index %d exp %.20f got %.20f\n", info, idx, p_exp[idx], p_got[idx]); + if (isnan(p_exp[idx]) && isnan(p_got[idx])) { + printf("both exp and got are NAN"); + return 0; + } + return -1; + } + } else { + if (fabs(p_exp[idx] - p_got[idx]) > delta) { + printf("%s 
abs error at index %d exp %.20f got %.20f\n", info, idx, p_exp[idx], p_got[idx]); + if (isnan(p_exp[idx]) && isnan(p_got[idx])) { + printf("both exp and got are NAN"); + return 0; + } + return -1; + } + } + + if (isnan(p_got[idx]) && !isnan(p_exp[idx])) { + printf("%s, found nans idx %d\n", info, idx); + printf("floating from exp %.10f got %.10f\n", p_exp[idx], p_got[idx]); + IF_VAL exp, got; + exp.fval = p_exp[idx]; + got.fval = p_got[idx]; + printf("hex form exp %8.8x got %8.8x\n", exp.ival, got.ival); + return -2; + } + } + return 0; +} + +int array_cmp_float(const char *const info, float *p_exp, float *p_got, int count, float delta) { + if (delta == 0.0f) { + for (int idx = 0; idx < count; idx++) { + if (p_exp[idx] != p_got[idx]) { + printf("%s error at index %d exp %.20f got %.20f\n", info, idx, p_exp[idx], p_got[idx]); + if (isnan(p_exp[idx]) && isnan(p_got[idx])) { + printf("both exp and got are NAN\n"); + return 0; + } + return -1; + } + } + } else { + return array_cmp_float_rel(info, p_exp, p_got, count, delta); + } + return 0; +} + +int array_cmp_int(const char *const info, int *p_exp, int *p_got, int count) { + int idx; + for (idx = 0; idx < count; idx++) { + if (p_exp[idx] != p_got[idx]) { + printf("%s error at index %d exp %d got %d\n", info, idx, p_exp[idx], p_got[idx]); + return -1; + } + } + return 0; +} + +int array_cmp_int8(const char *const info, const int8_t *p_exp, const int8_t *p_got, int count) { + int idx; + for (idx = 0; idx < count; idx++) { + if (p_exp[idx] != p_got[idx]) { + printf("%s error at index %d exp %d got %d\n", info, idx, p_exp[idx], p_got[idx]); + return -1; + } + } + return 0; +} + +int calc_dilute_hw(int h, int ins_h, int ins_h_l, int pad_h_b, int pad_h_t) { + return (h - 1) * (ins_h + 1) + ins_h_l + 1 + pad_h_t + pad_h_b; +} + +int calc_output_hw(int hw, int khw, int stride) { return (hw - khw) / stride + 1; } + +int fill_pad_fmap_int8(const int8_t *before, int8_t **pafter, int val, int pad_l, int pad_r, + int pad_t, int 
pad_b, int ins_h, int ins_w, int ins_h_last, int ins_w_last, + int h_before, int w_before) { + int w_after = (w_before - 1) * (ins_w + 1) + ins_w_last + 1 + pad_l + pad_r; + int h_after = (h_before - 1) * (ins_h + 1) + ins_h_last + 1 + pad_t + pad_b; + int8_t *after = *pafter; + if (!before || !pafter) return BM_ERR_INVALID_ARGUMENT; + + if (!after) { + after = malloc(sizeof(int8_t) * w_after * h_after); + if (!after) return BM_ERR_NOMEM; + } + + memset(after, val, w_after * h_after); + for (int h = 0; h < h_before; h++) { + for (int w = 0; w < w_before; w++) { + int i = (h * (ins_h + 1) + pad_t) * w_after + w * (ins_w + 1) + pad_l; + after[i] = before[h * w_before + w]; + } + } + + *pafter = after; + return BM_SUCCESS; +} + +int fill_pad_fmap_bf16(const uint16_t *before, uint16_t **pafter, int val, int pad_l, int pad_r, + int pad_t, int pad_b, int ins_h, int ins_w, int ins_h_last, int ins_w_last, + int h_before, int w_before) { + int w_after = (w_before - 1) * (ins_w + 1) + ins_w_last + 1 + pad_l + pad_r; + int h_after = (h_before - 1) * (ins_h + 1) + ins_h_last + 1 + pad_t + pad_b; + uint16_t *after = *pafter; + if (!before || !pafter) return BM_ERR_INVALID_ARGUMENT; + if (!after) { + after = malloc(sizeof(uint16_t) * w_after * h_after); + if (!after) return BM_ERR_NOMEM; + } + for (int i = 0; i < w_after * h_after; i++) after[i] = val; + + for (int h = 0; h < h_before; h++) { + for (int w = 0; w < w_before; w++) { + int i = (h * (ins_h + 1) + pad_t) * w_after + w * (ins_w + 1) + pad_l; + after[i] = before[h * w_before + w]; + } + } +#if 0 + printf("bf16 padding:\n"); + for(int i=0;i= 0 && in_y < input_h && in_x >= 0 && in_x < input_w) { + weight_offset[0] = o + o_head; + weight_offset[1] = k; + if (flip) { + weight_offset[2] = (kh - 1 - p); + weight_offset[3] = (kw - 1 - q); + } else { + weight_offset[2] = p; + weight_offset[3] = q; + } + in_offset[0] = n; + in_offset[1] = k + k_head; + in_offset[2] = in_y; + in_offset[3] = in_x; + ofmap_f[calc_offset(o_shape, 
out_offset)] += + ifmap_f[calc_offset(i_shape, in_offset)] * + weight_f[calc_offset(k_shape, weight_offset)]; + if (k_g == 1 && kh == 1 && kw == 1) { + ofmap_f[calc_offset(o_shape, out_offset)] = + ifmap_f[calc_offset(i_shape, in_offset)] * + weight_f[calc_offset(k_shape, weight_offset)]; + } + } + } + } + } + if (using_bias) { + ofmap_f[calc_offset(o_shape, out_offset)] += bias_f[o + o_head]; + } + if (result_add) { + ofmap_f[calc_offset(o_shape, out_offset)] += result_init; + } + } + } + } + } + } +} + +int native_fc_int8(const int8_t *L, const int8_t *R, const int16_t *B, const int16_t *Y, int *Y_ref, + int L_row_num, int L_col_num, int R_col_num, int L_sign, int R_sign, int B_sign, + int l_shift_width, int r_shift_width, int is_result_int8, int do_relu) { + const uint8_t *uL = (const uint8_t *)L; + const uint8_t *uR = (const uint8_t *)R; + const uint16_t *uB = (const uint16_t *)B; + + int opd0, opd1, opd2; + int ret = BM_SUCCESS; + + for (int hidx = 0; hidx < L_row_num; hidx++) { + for (int widx = 0; widx < R_col_num; widx++) { + int Y1 = 0; + int Y2 = 0; + int sum_idx = 0; + for (sum_idx = 0; sum_idx < L_col_num; sum_idx++) { + int idx_L = index_get(hidx, L_col_num, sum_idx); + int idx_R = index_get(sum_idx, R_col_num, widx); + opd0 = (L_sign) ? L[idx_L] : uL[idx_L]; + opd1 = (R_sign) ? R[idx_R] : uR[idx_R]; + if ((sum_idx % 2 == 0 && sum_idx != 2) || sum_idx == 1) { + Y1 += opd0 * opd1; + } else { + Y2 += opd0 * opd1; + } + } + sum_idx++; + + if (B) { + opd2 = (B_sign) ? 
(int)B[widx] : (int)uB[widx]; + if ((sum_idx % 2 == 0 && sum_idx != 2) || sum_idx == 1) { + Y1 += opd2; + } else { + Y2 += opd2; + } + sum_idx++; + } + + int idx_Y = index_get(hidx, R_col_num, widx); + if (Y) { + if ((sum_idx % 2 == 0 && sum_idx != 2) || sum_idx == 1) { + Y1 += (Y[idx_Y] << l_shift_width); + } else { + Y2 += (Y[idx_Y] << l_shift_width); + } + } + + Y_ref[idx_Y] = Y1 + Y2; + } + } + uint8_t *Yout_int8 = malloc(sizeof(int8_t) * L_row_num * R_col_num); + uint16_t *Yout_int16 = malloc(sizeof(int16_t) * L_row_num * R_col_num); + + if (is_result_int8) { + ret = + satu_2_8bit(Y_ref, L_row_num * R_col_num, (int8_t *)Yout_int8, r_shift_width, 1, !do_relu); + if (ret != BM_SUCCESS) goto error_release; + + fill_int_with_int8(Y_ref, (int8_t *)Yout_int8, L_row_num * R_col_num); + } else { + ret = satu_2_16bit(Y_ref, L_row_num * R_col_num, (int16_t *)Yout_int16, r_shift_width, 1, + !do_relu); + if (ret != BM_SUCCESS) goto error_release; + + fill_int_with_int16(Y_ref, (int16_t *)Yout_int16, L_row_num * R_col_num); + } + +error_release: + free(Yout_int8); + free(Yout_int16); + + return ret; +} + +int native_pooling_ave_int8(const int8_t *i_fmap, const void *weight, const int16_t *bias, + int8_t *o_fmap, int input_n, int input_c, int input_h, int input_w, + int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, int ins_h, int ins_w, int ins_h_last, + int ins_w_last, int input_sign, int satu_sign, int r_shift_width, + int const_weight) { + if (kh * kw <= 0) return BM_ERR_INVALID_ARGUMENT; + + int *avg_pooling_mac_a = (int *)malloc(kh * kw * sizeof(int)); + int *avg_pooling_mac_b = (int *)malloc(kh * kw * sizeof(int)); + + uint8_t avg_const_weight = *(uint8_t *)weight; + const int8_t *weight_arr = weight; + + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + + int output_h = calc_output_hw(h_after, kh, stride_h); + int 
output_w = calc_output_hw(w_after, kw, stride_w); + + int8_t *i_fmap_pad = NULL; + for (int n = 0; n < input_n; n++) { + if (const_weight == 0) weight_arr = weight; + + for (int c = 0; c < input_c; ++c) { + fill_pad_fmap_int8(i_fmap, &i_fmap_pad, 0, pad_w_l, pad_w_r, pad_h_t, pad_h_b, ins_h, ins_w, + ins_h_last, ins_w_last, input_h, input_w); + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + int mac_index = 0; + int avg_pool_result; + + for (int h = 0; h < kh; h++) { + for (int w = 0; w < kw; w++) { + int index = index_get((hstart + h), w_after, (w + wstart)); + mac_index = index_get(h, kw, w); + avg_pooling_mac_a[mac_index] = + input_sign ? i_fmap_pad[index] : (uint8_t)(i_fmap_pad[index]); + + avg_pooling_mac_b[mac_index] = + const_weight ? avg_const_weight : weight_arr[mac_index]; + } + } + + inner_product(avg_pooling_mac_a, avg_pooling_mac_b, kh * kw, &avg_pool_result); + + if (bias) { + avg_pool_result += bias[c]; + } + + int ret = satu_2_8bit(&avg_pool_result, sizeof(int8_t), o_fmap + pool_index, + r_shift_width, 1, satu_sign); + + if (ret != BM_SUCCESS) { + free(i_fmap_pad); + free(avg_pooling_mac_a); + free(avg_pooling_mac_b); + + return BM_ERR_INVALID_ARGUMENT; + } + } + } + i_fmap += input_w * input_h; + if (const_weight == 0) weight_arr += kh * kw; + + o_fmap += output_w * output_h; + } + } + free(i_fmap_pad); + + free(avg_pooling_mac_a); + free(avg_pooling_mac_b); + + return BM_SUCCESS; +} + +int native_pooling_max_int8(const int8_t *i_fmap, int8_t *o_fmap, int input_n, int input_c, + int input_h, int input_w, int kh, int kw, int pad_h_t, int pad_h_b, + int pad_w_l, int pad_w_r, int stride_h, int stride_w, int ins_h, + int ins_w, int ins_h_last, int ins_w_last, int input_sign) { + if (ins_h != 0 || ins_w != 0 || ins_h_last != 0 || ins_w_last != 0) + return BM_ERR_INVALID_ARGUMENT; + + int h_after = 
calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + + const int max_init = input_sign ? -128 : 0; + int8_t *i_fmap_pad = NULL; + for (int nc = 0; nc < input_n * input_c; nc++) { + fill_pad_fmap_int8(i_fmap, &i_fmap_pad, max_init, pad_w_l, pad_w_r, pad_h_t, pad_h_b, 0, 0, 0, + 0, input_h, input_w); + + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + int max = max_init; + for (int h = 0; h < kh; h++) { + for (int w = 0; w < kw; w++) { + int index = index_get((hstart + h), (input_w + pad_w_l + pad_w_r), (w + wstart)); + int val = input_sign ? i_fmap_pad[index] : (uint8_t)i_fmap_pad[index]; + max = (val > max) ? val : max; + } + } + o_fmap[pool_index] = max; + } + } + i_fmap += input_w * input_h; + o_fmap += output_w * output_h; + } + free(i_fmap_pad); + + return BM_SUCCESS; +} + +int native_pooling_max_fp32(const float *ifmap, float *ofmap, int input_n, int input_c, int input_h, + int input_w, int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l, + int pad_w_r, int stride_h, int stride_w, int ins_h, int ins_w, + int ins_h_last, int ins_w_last) { + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + float *ifmap_after = malloc(sizeof(float) * h_after * w_after); + + if (ifmap_after == NULL) { + printf("No enough memory[h_after, w_after]: [%u, %u].\n", h_after, w_after); + return BM_ERR_NOMEM; + } + + for (int n = 0; n < input_n; n++) { + for (int c = 0; c < input_c; c++) { + int ret = fill_pad_fmap_fp32(ifmap, 
&ifmap_after, -FLT_MAX, pad_h_t, pad_h_b, pad_w_l, + pad_w_r, ins_h, ins_w, ins_h_last, ins_w_last, input_h, input_w); + + if (ret != BM_SUCCESS) { + printf("Failed to pad input fmap.\n"); + free(ifmap_after); + return BM_ERR_FAILURE; + } + + for (int h = 0; h < output_h; h++) { + for (int w = 0; w < output_w; w++) { + int rf_h = h * stride_h, rf_w = w * stride_w; + int kh_end = math_min(kh, h_after - rf_h); + int kw_end = math_min(kw, w_after - rf_w); + float *rf_addr = ifmap_after + rf_h * w_after + rf_w; + float max_val = -FLT_MAX; + + for (int i = 0; i < kh_end; i++) { + for (int j = 0; j < kw_end; j++) { + max_val = math_max(rf_addr[i * w_after + j], max_val); + } + } + ofmap[h * output_w + w] = max_val; + } + } + + ifmap += input_h * input_w; + ofmap += output_h * output_w; + } + } + + free(ifmap_after); + return BM_SUCCESS; +} + +int native_pooling_avg_fp32(const float *ifmap, float *ofmap, int input_n, int input_c, int input_h, + int input_w, int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l, + int pad_w_r, int stride_h, int stride_w, int ins_h, int ins_w, + int ins_h_last, int ins_w_last, float avg_pooling_const) { + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + int output_h = calc_output_hw(h_after, kh, stride_h); + int output_w = calc_output_hw(w_after, kw, stride_w); + float *ifmap_after = malloc(sizeof(float) * h_after * w_after); + + if (ifmap_after == NULL) { + printf("No enough memory[h_after, w_after]: [%u, %u].\n", h_after, w_after); + return BM_ERR_NOMEM; + } + + for (int n = 0; n < input_n; n++) { + for (int c = 0; c < input_c; c++) { + int ret = fill_pad_fmap_fp32(ifmap, &ifmap_after, 0, pad_h_t, pad_h_b, pad_w_l, pad_w_r, + ins_h, ins_w, ins_h_last, ins_w_last, input_h, input_w); + + if (ret != BM_SUCCESS) { + printf("Failed to pad input fmap.\n"); + free(ifmap_after); + return BM_ERR_FAILURE; + } + + for (int h = 0; h < output_h; 
h++) { + for (int w = 0; w < output_w; w++) { + int rf_h = h * stride_h, rf_w = w * stride_w; + int kh_end = math_min(kh, h_after - rf_h); + int kw_end = math_min(kw, w_after - rf_w); + float *rf_addr = ifmap_after + rf_h * w_after + rf_w; + float dot_product_even = 0.0, dot_product_odd = 0.0; + + for (int i = 0; i < kh_end; i++) { + for (int j = 0; j < kw_end; j++) { + if ((i * kw_end + j) % 2) { + dot_product_odd += rf_addr[i * w_after + j] * avg_pooling_const; + } else { + dot_product_even += rf_addr[i * w_after + j] * avg_pooling_const; + } + } + } + ofmap[h * output_w + w] = dot_product_even + dot_product_odd; + } + } + + ifmap += input_h * input_w; + ofmap += output_h * output_w; + } + } + + free(ifmap_after); + return BM_SUCCESS; +} + +void native_pooling_forward_max(const float *bottom_data, float *top_data, int *mask_data, + const int count, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, + const int pad_w) { + (void)num; + for (int index = 0; index < count; ++index) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + const int hend = math_min(hstart + kernel_h, height); + const int wend = math_min(wstart + kernel_w, width); + hstart = math_max(hstart, 0); + wstart = math_max(wstart, 0); + float maxval = -FLT_MAX; + int maxidx = -1; + const float *const bottom_slice = bottom_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + if (bottom_slice[h * width + w] > maxval) { + maxidx = h * width + w; + maxval = bottom_slice[maxidx]; + } + } + } + top_data[index] = 
maxval; + mask_data[index] = maxidx; + } +} + +void native_pooling_forward_ave(const float *bottom_data, float *top_data, const int count, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w) { + (void)num; + for (int index = 0; index < count; ++index) { + const int pw = index % pooled_width; + const int ph = (index / pooled_width) % pooled_height; + const int c = (index / pooled_width / pooled_height) % channels; + const int n = index / pooled_width / pooled_height / channels; + int hstart = ph * stride_h - pad_h; + int wstart = pw * stride_w - pad_w; + int hend = math_min(hstart + kernel_h, height + pad_h); + int wend = math_min(wstart + kernel_w, width + pad_w); + const int pool_size = (hend - hstart) * (wend - wstart); + hstart = math_max(hstart, 0); + wstart = math_max(wstart, 0); + hend = math_min(hend, height); + wend = math_min(wend, width); + float aveval = 0; + const float *const bottom_slice = bottom_data + (n * channels + c) * height * width; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + aveval += bottom_slice[h * width + w]; + } + } + top_data[index] = aveval / pool_size; + } +} + +int satu_2_8bit(const int *pBuff, int len, int8_t *pByteOut, int rshiftbits, int round_floor, + int sign_unsign) { + if (rshiftbits < 0) return BM_ERR_INVALID_ARGUMENT; + + int temp; + int satu_max = sign_unsign ? 127 : 255; + int satu_min = sign_unsign ? -128 : 0; + if (rshiftbits == 0) { + for (int ii = 0; ii < len; ii++) { + temp = (pBuff[ii] > satu_max) ? satu_max : ((pBuff[ii] < satu_min) ? 
satu_min : pBuff[ii]); + memcpy(pByteOut + ii, &temp, 1); + } + } else { // rshiftbits>0 + for (int ii = 0; ii < len; ii++) { + if (round_floor == 1) + temp = ((pBuff[ii] >> (rshiftbits - 1)) + 1) >> 1; + else + temp = pBuff[ii] >> rshiftbits; + temp = (temp > satu_max) ? satu_max : ((temp < satu_min) ? satu_min : temp); + memcpy(pByteOut + ii, &temp, 1); + } + } + + return BM_SUCCESS; +} + +int satu_2_16bit(const int *pBuff, int len, short *pByteOut, int rshiftbits, int round_floor, + int sign_unsign) { + if (rshiftbits < 0) return BM_ERR_INVALID_ARGUMENT; + + int ii; + int temp; + int satu_max = sign_unsign ? 32767 : 65535; + int satu_min = sign_unsign ? -32768 : 0; + if (rshiftbits == 0) { + for (ii = 0; ii < len; ii++) { + temp = (pBuff[ii] > satu_max) ? satu_max : ((pBuff[ii] < satu_min) ? satu_min : pBuff[ii]); + memcpy(pByteOut + ii, &temp, 2); + } + } else { // rshiftbits>0 + for (ii = 0; ii < len; ii++) { + if (round_floor == 1) + temp = ((pBuff[ii] >> (rshiftbits - 1)) + 1) >> 1; + else + temp = pBuff[ii] >> rshiftbits; + temp = (temp > satu_max) ? satu_max : ((temp < satu_min) ? 
satu_min : temp); + memcpy(pByteOut + ii, &temp, 2); + } + } + + return BM_SUCCESS; +} diff --git a/cvimath/tests/cvi1835/atan.cpp b/cvimath/tests/cvi1835/atan.cpp new file mode 100644 index 000000000..275c6170a --- /dev/null +++ b/cvimath/tests/cvi1835/atan.cpp @@ -0,0 +1,477 @@ +/** + * plz refer [git](https://github.com/xiezhq-hermann/atan_lookup) + * input range is `all real numbers` and output range is -pi/2 < x < pi/2, + * you can refer [here](https://www.mathopenref.com/arctan.html) for more + * details + */ +// +// xiezhq@shanghaitech.edu.cn && wanghe@shanghaitech.edu.cn +/* Reference: + [1] Abhisek Ukil, Vishal H Shah, Bernhard Deck, + "Fast Computation of arctangent Functions for Embedded Applications: A + Comparative Analysis" IEEE International Symposium on Industrial Electronics, + Pages: 1206 - 1211, DOI: 10.1109/ISIE.2011.5984330, 2011 + [2] Sreeraman Rajan, Sichun Wang, Robert Inkol, and Alain Joyal + "Efficient Approximations for the Arctangent Function" + IEEE SIGNAL PROCESSING MAGAZINE [108] MAY 2006 + */ + +#include +#include + +#define OUT +#define IN +#include +#include +#include +#include +#include +#include +//#define DBG + +using namespace std; + +#if 0 +double atan_double(double x) { + /* + More precise look-up table is used for higher accuracy + */ + if (x >= 0) { + if (x <= 1) { + int index = round(x * 100); + return (LUT_d[index] + (x * 100 - index) * (LUT_d[index + 1] - LUT_d[index])); + } else { + double re_x = 1 / x; + int index = round(re_x * 100); + return (M_PI_2 - (LUT_d[index] + (re_x * 100 - index) * (LUT_d[index + 1] - LUT_d[index]))); + // No recursive is better here + } + } else { + if (x >= -1) { + double abs_x = -x; + int index = round(abs_x * 100); + return -(LUT_d[index] + (abs_x * 100 - index) * (LUT_d[index + 1] - LUT_d[index])); + } else { + double re_x = 1 / (-x); + int index = round(re_x * 100); + return (LUT_d[index] + (re_x * 100 - index) * (LUT_d[index+1] - LUT_d[index])) - M_PI_2; + } + } +} +#endif + +/** + * 
pre_data means we test fixed pattern, it should be same sa lut + */ +enum TEST_MODE { + PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare + DATA_COMPARE_ACCURACY, // generate \range_start to \range_end value that + // check epsilon + DATA_COMPARE_U8, // generate \range_start to \range_end value that check + // epsilon, result bf16->uint8_t + TEST_MODE_MAX, +}; + +static TEST_MODE mode; + +static uint16_t test_pattern[] = { + 0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52, 0x3A6C, 0x3A83, 0x3A90, + 0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF, 0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17, + 0x3B1D, 0x3B24, 0x3B2A, 0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65, + 0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90, 0x3B93, 0x3B97, 0x3B9A, + 0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE, 0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1, + 0x3BC5, 0x3BC8, 0x3BCB, 0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9, + 0x3BEC, 0x3BEF, 0x3BF2, 0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03, 0x3C05, 0x3C06, 0x3C08, + 0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12, 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C, + 0x3C1D, 0x3C1F, 0x3C21, 0x3C22, 0x3C24, 0x3C25, 0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F, + 0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E, 0x3C40, 0x3C41, 0x3C43, + 0x3C45, 0x3C46, 0x3C48, 0x3C4A, 0x3C4B, 0x3C4D, 0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57, + 0x3C58, 0x3C5A, 0x3C5C, 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A, + 0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, 0x3C7B, 0x3C7C, 0x3C7E, + 0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89, + 0x3C8A, 0x3C8A, 0x3C8B, 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93, + 0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, 0x3C9B, 0x3C9C, 0x3C9C, + 
0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6, + 0x3CA7, 0x3CA8, 0x3CA9, 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0, + 0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, 0x3CB8, 0x3CB9, 0x3CBA, + 0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4, + 0x3CC5, 0x3CC5, 0x3CC6, 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE, + 0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, 0x3CD6, 0x3CD7, 0x3CD7, + 0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1, + 0x3CE2, 0x3CE3, 0x3CE4, 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB, + 0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, 0x3CF3, 0x3CF4, 0x3CF5, + 0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF, + 0x3D00, 0x3D00, 0x3D01, 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 0x3D03, 0x3D03, 0x3D04, 0x3D04, + 0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, 0x3D08, 0x3D09, 0x3D09, + 0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 0x3D0E, + 0x3D0F, 0x3D0F, 0x3D0F, 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13, + 0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 0x3D16, 0x3D17, 0x3D17, 0x3D18, 0x3D18, + 0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D, + 0x3D1D, 0x3D1E, 0x3D1E, 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22, + 0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, 0x3D26, 0x3D26, 0x3D27, + 0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C, + 0x3D2C, 0x3D2C, 0x3D2D, 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31, + 0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 
0x3D33, 0x3D33, 0x3D34, 0x3D34, 0x3D35, 0x3D35, 0x3D35, + 0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A, + 0x3D3B, 0x3D3B, 0x3D3C, 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F, + 0x3D40, 0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, 0x3D43, 0x3D44, 0x3D44, + 0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49, + 0x3D4A, 0x3D4A, 0x3D4A, 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E, + 0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, 0x3D52, 0x3D53, 0x3D53, + 0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58, + 0x3D58, 0x3D59, 0x3D59, 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D, + 0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, 0x3D61, 0x3D61, 0x3D62, + 0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67, + 0x3D67, 0x3D67, 0x3D68, 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C, + 0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, 0x3D70, 0x3D70, 0x3D70, + 0x3D71, 0x3D71, 0x3D72, 0x3D72, 0x3D72, 0x3D73, 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75, + 0x3D76, 0x3D76, 0x3D77, 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A, + 0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, 0x3D7E, 0x3D7F, 0x3D7F, + 0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82, + 0x3D82, 0x3D82, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85, + 0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D87, 0x3D87, 0x3D87, + 0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89, + 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 
0x3D8C, 0x3D8C, + 0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, + 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91, + 0x3D91, 0x3D91, 0x3D91, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93, + 0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, 0x3D95, 0x3D96, 0x3D96, + 0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98, + 0x3D98, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B, + 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9D, 0x3D9D, 0x3D9D, + 0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0, + 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2, + 0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, + 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7, + 0x3DA7, 0x3DA7, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, + 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAC, 0x3DAC, + 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, + 0x3DAE, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1, + 0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3, + 0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6, + 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8, + 0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBB, + 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD, + 0x3DBD, 0x3DBD, 
0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, + 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC2, 0x3DC2, 0x3DC2, + 0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, + 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7, + 0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, 0x3DC9, 0x3DC9, 0x3DC9, + 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC, + 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5, + 0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, 0x3DDF, 0x3DE0, 0x3DE1, + 0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5, +}; + +static uint16_t golden_bf16[] = { + 0x0, 0x38d2, 0x3952, 0x399d, 0x39d2, 0x3a03, 0x3a1d, 0x3a38, 0x3a52, 0x3a6c, 0x3a83, 0x3a90, + 0x3a9d, 0x3aaa, 0x3ab8, 0x3ac5, 0x3ad2, 0x3adf, 0x3aec, 0x3afa, 0x3b03, 0x3b0a, 0x3b10, 0x3b17, + 0x3b1d, 0x3b24, 0x3b2a, 0x3b31, 0x3b38, 0x3b3e, 0x3b45, 0x3b4c, 0x3b52, 0x3b59, 0x3b5f, 0x3b65, + 0x3b6c, 0x3b72, 0x3b7a, 0x3b80, 0x3b83, 0x3b86, 0x3b8a, 0x3b8d, 0x3b90, 0x3b93, 0x3b97, 0x3b9a, + 0x3b9d, 0x3ba1, 0x3ba4, 0x3ba7, 0x3baa, 0x3bae, 0x3bb1, 0x3bb4, 0x3bb8, 0x3bbb, 0x3bbe, 0x3bc1, + 0x3bc5, 0x3bc8, 0x3bcb, 0x3bce, 0x3bd2, 0x3bd6, 0x3bd8, 0x3bdc, 0x3bdf, 0x3be2, 0x3be6, 0x3be9, + 0x3bec, 0x3bef, 0x3bf2, 0x3bf6, 0x3bf9, 0x3bfc, 0x3c00, 0x3c01, 0x3c03, 0x3c05, 0x3c06, 0x3c08, + 0x3c0a, 0x3c0b, 0x3c0d, 0x3c0f, 0x3c10, 0x3c12, 0x3c13, 0x3c15, 0x3c17, 0x3c18, 0x3c1a, 0x3c1c, + 0x3c1d, 0x3c1f, 0x3c21, 0x3c22, 0x3c24, 0x3c25, 0x3c27, 0x3c29, 0x3c2a, 0x3c2c, 0x3c2e, 0x3c2f, + 0x3c31, 0x3c33, 0x3c34, 0x3c36, 0x3c38, 0x3c39, 0x3c3b, 0x3c3c, 0x3c3e, 0x3c40, 0x3c41, 0x3c43, + 0x3c45, 0x3c46, 0x3c48, 0x3c4a, 0x3c4b, 0x3c4d, 0x3c4e, 0x3c50, 0x3c52, 0x3c53, 0x3c55, 0x3c57, + 0x3c58, 0x3c5a, 0x3c5c, 0x3c5d, 0x3c5f, 0x3c60, 0x3c62, 0x3c64, 0x3c66, 0x3c68, 
0x3c69, 0x3c6a, + 0x3c6c, 0x3c6e, 0x3c70, 0x3c71, 0x3c72, 0x3c74, 0x3c76, 0x3c78, 0x3c79, 0x3c7b, 0x3c7c, 0x3c7e, + 0x3c80, 0x3c81, 0x3c81, 0x3c82, 0x3c83, 0x3c84, 0x3c85, 0x3c86, 0x3c86, 0x3c87, 0x3c88, 0x3c89, + 0x3c8a, 0x3c8a, 0x3c8b, 0x3c8c, 0x3c8d, 0x3c8e, 0x3c8f, 0x3c8f, 0x3c90, 0x3c91, 0x3c92, 0x3c93, + 0x3c93, 0x3c94, 0x3c95, 0x3c96, 0x3c97, 0x3c98, 0x3c98, 0x3c99, 0x3c9a, 0x3c9b, 0x3c9c, 0x3c9c, + 0x3c9d, 0x3c9e, 0x3c9f, 0x3ca0, 0x3ca1, 0x3ca1, 0x3ca2, 0x3ca3, 0x3ca4, 0x3ca5, 0x3ca5, 0x3ca6, + 0x3ca7, 0x3ca8, 0x3ca9, 0x3caa, 0x3caa, 0x3cab, 0x3cac, 0x3cad, 0x3cae, 0x3cae, 0x3caf, 0x3cb0, + 0x3cb1, 0x3cb2, 0x3cb3, 0x3cb3, 0x3cb4, 0x3cb5, 0x3cb6, 0x3cb7, 0x3cb8, 0x3cb8, 0x3cb9, 0x3cba, + 0x3cbb, 0x3cbc, 0x3cbc, 0x3cbd, 0x3cbe, 0x3cbf, 0x3cc0, 0x3cc1, 0x3cc1, 0x3cc2, 0x3cc3, 0x3cc4, + 0x3cc5, 0x3cc5, 0x3cc6, 0x3cc7, 0x3cc8, 0x3cc9, 0x3cca, 0x3cca, 0x3ccb, 0x3ccc, 0x3ccd, 0x3cce, + 0x3cce, 0x3ccf, 0x3cd0, 0x3cd1, 0x3cd2, 0x3cd3, 0x3cd3, 0x3cd4, 0x3cd5, 0x3cd6, 0x3cd7, 0x3cd7, + 0x3cd8, 0x3cd9, 0x3cda, 0x3cdb, 0x3cdc, 0x3cdc, 0x3cdd, 0x3cde, 0x3cdf, 0x3ce0, 0x3ce0, 0x3ce1, + 0x3ce2, 0x3ce3, 0x3ce4, 0x3ce5, 0x3ce5, 0x3ce6, 0x3ce7, 0x3ce8, 0x3ce9, 0x3ce9, 0x3cea, 0x3ceb, + 0x3cec, 0x3ced, 0x3cee, 0x3cee, 0x3cef, 0x3cf0, 0x3cf1, 0x3cf2, 0x3cf2, 0x3cf3, 0x3cf4, 0x3cf5, + 0x3cf6, 0x3cf7, 0x3cf7, 0x3cf8, 0x3cf9, 0x3cfa, 0x3cfb, 0x3cfb, 0x3cfc, 0x3cfd, 0x3cfe, 0x3cff, + 0x3d00, 0x3d00, 0x3d01, 0x3d01, 0x3d01, 0x3d02, 0x3d02, 0x3d03, 0x3d03, 0x3d03, 0x3d04, 0x3d04, + 0x3d05, 0x3d05, 0x3d06, 0x3d06, 0x3d06, 0x3d07, 0x3d07, 0x3d08, 0x3d08, 0x3d08, 0x3d09, 0x3d09, + 0x3d0a, 0x3d0a, 0x3d0a, 0x3d0b, 0x3d0b, 0x3d0c, 0x3d0c, 0x3d0c, 0x3d0d, 0x3d0d, 0x3d0e, 0x3d0e, + 0x3d0f, 0x3d0f, 0x3d0f, 0x3d10, 0x3d10, 0x3d11, 0x3d11, 0x3d11, 0x3d12, 0x3d12, 0x3d13, 0x3d13, + 0x3d13, 0x3d14, 0x3d14, 0x3d15, 0x3d15, 0x3d16, 0x3d16, 0x3d16, 0x3d17, 0x3d17, 0x3d18, 0x3d18, + 0x3d18, 0x3d19, 0x3d19, 0x3d1a, 0x3d1a, 0x3d1a, 0x3d1b, 0x3d1b, 0x3d1c, 0x3d1c, 0x3d1c, 0x3d1d, + 0x3d1d, 0x3d1e, 
0x3d1e, 0x3d1f, 0x3d1f, 0x3d1f, 0x3d20, 0x3d20, 0x3d21, 0x3d21, 0x3d21, 0x3d22, + 0x3d22, 0x3d23, 0x3d23, 0x3d23, 0x3d24, 0x3d24, 0x3d25, 0x3d25, 0x3d25, 0x3d26, 0x3d26, 0x3d27, + 0x3d27, 0x3d28, 0x3d28, 0x3d28, 0x3d29, 0x3d29, 0x3d2a, 0x3d2a, 0x3d2a, 0x3d2b, 0x3d2b, 0x3d2c, + 0x3d2c, 0x3d2c, 0x3d2d, 0x3d2d, 0x3d2e, 0x3d2e, 0x3d2e, 0x3d2f, 0x3d2f, 0x3d30, 0x3d30, 0x3d31, + 0x3d31, 0x3d31, 0x3d32, 0x3d32, 0x3d33, 0x3d33, 0x3d33, 0x3d34, 0x3d34, 0x3d35, 0x3d35, 0x3d35, + 0x3d36, 0x3d36, 0x3d37, 0x3d37, 0x3d38, 0x3d38, 0x3d38, 0x3d39, 0x3d39, 0x3d3a, 0x3d3a, 0x3d3a, + 0x3d3b, 0x3d3b, 0x3d3c, 0x3d3c, 0x3d3c, 0x3d3d, 0x3d3d, 0x3d3e, 0x3d3e, 0x3d3e, 0x3d3f, 0x3d3f, + 0x3d40, 0x3d40, 0x3d41, 0x3d41, 0x3d41, 0x3d42, 0x3d42, 0x3d43, 0x3d43, 0x3d43, 0x3d44, 0x3d44, + 0x3d45, 0x3d45, 0x3d45, 0x3d46, 0x3d46, 0x3d47, 0x3d47, 0x3d47, 0x3d48, 0x3d48, 0x3d49, 0x3d49, + 0x3d4a, 0x3d4a, 0x3d4a, 0x3d4b, 0x3d4b, 0x3d4c, 0x3d4c, 0x3d4c, 0x3d4d, 0x3d4d, 0x3d4e, 0x3d4e, + 0x3d4e, 0x3d4f, 0x3d4f, 0x3d50, 0x3d50, 0x3d50, 0x3d51, 0x3d51, 0x3d52, 0x3d52, 0x3d53, 0x3d53, + 0x3d53, 0x3d54, 0x3d54, 0x3d55, 0x3d55, 0x3d55, 0x3d56, 0x3d56, 0x3d57, 0x3d57, 0x3d57, 0x3d58, + 0x3d58, 0x3d59, 0x3d59, 0x3d59, 0x3d5a, 0x3d5a, 0x3d5b, 0x3d5b, 0x3d5c, 0x3d5c, 0x3d5c, 0x3d5d, + 0x3d5d, 0x3d5e, 0x3d5e, 0x3d5e, 0x3d5f, 0x3d5f, 0x3d60, 0x3d60, 0x3d60, 0x3d60, 0x3d60, 0x3d61, + 0x3d61, 0x3d62, 0x3d62, 0x3d62, 0x3d63, 0x3d63, 0x3d64, 0x3d64, 0x3d64, 0x3d65, 0x3d65, 0x3d66, + 0x3d66, 0x3d66, 0x3d67, 0x3d67, 0x3d68, 0x3d68, 0x3d68, 0x3d69, 0x3d69, 0x3d6a, 0x3d6a, 0x3d6b, + 0x3d6b, 0x3d6b, 0x3d6c, 0x3d6c, 0x3d6d, 0x3d6d, 0x3d6d, 0x3d6e, 0x3d6e, 0x3d6f, 0x3d6f, 0x3d6f, + 0x3d70, 0x3d70, 0x3d71, 0x3d71, 0x3d71, 0x3d72, 0x3d72, 0x3d73, 0x3d73, 0x3d74, 0x3d74, 0x3d74, + 0x3d75, 0x3d75, 0x3d76, 0x3d76, 0x3d76, 0x3d77, 0x3d77, 0x3d78, 0x3d78, 0x3d78, 0x3d79, 0x3d79, + 0x3d7a, 0x3d7a, 0x3d7a, 0x3d7b, 0x3d7b, 0x3d7c, 0x3d7c, 0x3d7d, 0x3d7d, 0x3d7d, 0x3d7e, 0x3d7e, + 0x3d7f, 0x3d7f, 0x3d7f, 0x3d7f, 0x3d81, 0x3d81, 0x3d81, 
0x3d81, 0x3d81, 0x3d82, 0x3d82, 0x3d82, + 0x3d82, 0x3d82, 0x3d83, 0x3d83, 0x3d83, 0x3d83, 0x3d83, 0x3d84, 0x3d84, 0x3d84, 0x3d84, 0x3d85, + 0x3d85, 0x3d85, 0x3d85, 0x3d85, 0x3d86, 0x3d86, 0x3d86, 0x3d86, 0x3d86, 0x3d87, 0x3d87, 0x3d87, + 0x3d87, 0x3d87, 0x3d88, 0x3d88, 0x3d88, 0x3d88, 0x3d88, 0x3d89, 0x3d89, 0x3d89, 0x3d89, 0x3d89, + 0x3d8a, 0x3d8a, 0x3d8a, 0x3d8a, 0x3d8a, 0x3d8b, 0x3d8b, 0x3d8b, 0x3d8b, 0x3d8b, 0x3d8c, 0x3d8c, + 0x3d8c, 0x3d8c, 0x3d8c, 0x3d8d, 0x3d8d, 0x3d8d, 0x3d8d, 0x3d8e, 0x3d8e, 0x3d8e, 0x3d8e, 0x3d8e, + 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d90, 0x3d90, 0x3d90, 0x3d90, 0x3d90, 0x3d91, 0x3d91, + 0x3d91, 0x3d91, 0x3d91, 0x3d92, 0x3d92, 0x3d92, 0x3d92, 0x3d92, 0x3d93, 0x3d93, 0x3d93, 0x3d93, + 0x3d93, 0x3d94, 0x3d94, 0x3d94, 0x3d94, 0x3d94, 0x3d95, 0x3d95, 0x3d95, 0x3d95, 0x3d96, 0x3d96, + 0x3d96, 0x3d96, 0x3d96, 0x3d97, 0x3d97, 0x3d97, 0x3d97, 0x3d97, 0x3d98, 0x3d98, 0x3d98, 0x3d98, + 0x3d98, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d99, 0x3d9a, + 0x3d9a, 0x3d9a, 0x3d9a, 0x3d9a, 0x3d9b, 0x3d9b, 0x3d9b, 0x3d9b, 0x3d9b, 0x3d9c, 0x3d9c, 0x3d9c, + 0x3d9c, 0x3d9c, 0x3d9d, 0x3d9d, 0x3d9d, 0x3d9d, 0x3d9e, 0x3d9e, 0x3d9e, 0x3d9e, 0x3d9e, 0x3d9f, + 0x3d9f, 0x3d9f, 0x3d9f, 0x3d9f, 0x3da0, 0x3da0, 0x3da0, 0x3da0, 0x3da0, 0x3da1, 0x3da1, 0x3da1, + 0x3da1, 0x3da1, 0x3da2, 0x3da2, 0x3da2, 0x3da2, 0x3da2, 0x3da3, 0x3da3, 0x3da3, 0x3da3, 0x3da3, + 0x3da4, 0x3da4, 0x3da4, 0x3da4, 0x3da4, 0x3da5, 0x3da5, 0x3da5, 0x3da5, 0x3da6, 0x3da6, 0x3da6, + 0x3da6, 0x3da6, 0x3da7, 0x3da7, 0x3da7, 0x3da7, 0x3da7, 0x3da8, 0x3da8, 0x3da8, 0x3da8, 0x3da8, + 0x3da9, 0x3da9, 0x3da9, 0x3da9, 0x3da9, 0x3daa, 0x3daa, 0x3daa, 0x3daa, 0x3daa, 0x3dab, 0x3dab, + 0x3dab, 0x3dab, 0x3dab, 0x3dac, 0x3dac, 0x3dac, 0x3dac, 0x3dac, 0x3dad, 0x3dad, 0x3dad, 0x3dad, + 0x3dad, 0x3daf, 0x3daf, 0x3daf, 0x3daf, 0x3db0, 0x3db0, 0x3db0, 0x3db0, 0x3db0, 0x3db1, 0x3db1, + 0x3db1, 0x3db1, 0x3db1, 0x3db2, 0x3db2, 0x3db2, 0x3db2, 0x3db2, 0x3db3, 0x3db3, 0x3db3, 0x3db3, 
+ 0x3db3, 0x3db4, 0x3db4, 0x3db4, 0x3db4, 0x3db4, 0x3db5, 0x3db5, 0x3db5, 0x3db5, 0x3db5, 0x3db6, + 0x3db6, 0x3db6, 0x3db6, 0x3db6, 0x3db7, 0x3db7, 0x3db7, 0x3db7, 0x3db8, 0x3db8, 0x3db8, 0x3db8, + 0x3db8, 0x3db9, 0x3db9, 0x3db9, 0x3db9, 0x3db9, 0x3dba, 0x3dba, 0x3dba, 0x3dba, 0x3dba, 0x3dbb, + 0x3dbb, 0x3dbb, 0x3dbb, 0x3dbb, 0x3dbc, 0x3dbc, 0x3dbc, 0x3dbc, 0x3dbc, 0x3dbd, 0x3dbd, 0x3dbd, + 0x3dbd, 0x3dbd, 0x3dbe, 0x3dbe, 0x3dbe, 0x3dbe, 0x3dbe, 0x3dbf, 0x3dbf, 0x3dbf, 0x3dbf, 0x3dbf, + 0x3dc0, 0x3dc0, 0x3dc0, 0x3dc0, 0x3dc1, 0x3dc1, 0x3dc1, 0x3dc1, 0x3dc1, 0x3dc1, 0x3dc1, 0x3dc1, + 0x3dc1, 0x3dc1, 0x3dc2, 0x3dc2, 0x3dc2, 0x3dc2, 0x3dc2, 0x3dc3, 0x3dc3, 0x3dc3, 0x3dc3, 0x3dc3, + 0x3dc4, 0x3dc4, 0x3dc4, 0x3dc4, 0x3dc4, 0x3dc5, 0x3dc5, 0x3dc5, 0x3dc5, 0x3dc5, 0x3dc6, 0x3dc6, + 0x3dc6, 0x3dc6, 0x3dc6, 0x3dc7, 0x3dc7, 0x3dc7, 0x3dc7, 0x3dc7, 0x3dc8, 0x3dc8, 0x3dc8, 0x3dc8, + 0x3dc9, 0x3dc9, 0x3dc9, 0x3dc9, 0x3dc9, 0x3dca, 0x3dca, 0x3dca, 0x3dca, 0x3dca, 0x3dcb, 0x3dcb, + 0x3dcb, 0x3dcb, 0x3dcb, 0x3dcc, 0x3dcd, 0x3dce, 0x3dcf, 0x3dd0, 0x3dd1, 0x3dd2, 0x3dd3, 0x3dd4, + 0x3dd5, 0x3dd6, 0x3dd7, 0x3dd8, 0x3dd9, 0x3dda, 0x3ddb, 0x3ddc, 0x3ddd, 0x3dde, 0x3ddf, 0x3de0, + 0x3de1, 0x3de2, 0x3de3, 0x3de4, +}; + +// dist(range_start, range_end); + int table_hw = 256; + for (uint64_t i = 0; i < ifmap_size; i++) { + // input range is -8 ~ +8 + float input = + ((int)i % (range_end - 2)) * (((int)i % 2) ? 1 : -1) + 0.03 + (i % table_hw) * 0.002; + // float input = ((int)i % 10) * (((int)i % 2) ? 1 : -1) + 0.03 + (i % + // table_hw) * 0.002; float input = dist(e2); input = ((int)i % + // (range_end-2)) * (((int)i % 2) ? 
1 : 1) + 0.03 + (i % table_hw) * + // 0.002; if (input < 1 && input > 0) { + // input = 111.9; + //} + input_data[i] = convert_fp32_bf16(input); + } + input_data[0] = convert_fp32_bf16(0); + input_data[1] = convert_fp32_bf16(1); + input_data[2] = convert_fp32_bf16(-1); + } + +#ifdef DBG + for (uint64_t i = 0; i < ifmap_size; i++) { + printf("source if[%lu] bf16 %f 0x%x, log2f is %f\n", i, convert_bf16_fp32(input_data[i]), + input_data[i], floor(log2((convert_bf16_fp32(input_data[i]))))); + } +#endif /* ifdef DBG */ +} + +static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) { + // TODO: check more shape / align + cvk_chip_info_t chip_info = bmk->info; + + uint32_t input_n = 1; + uint32_t input_c = chip_info.npu_num; + uint32_t input_h = 16; + uint32_t input_w = 16; + float epsilon = 0.01; + int range_start = -8; + int range_end = 8; + cvk_fmt_t fmt = CVK_FMT_BF16; + + if (mode == PRE_DATA_COMPARE_FIX) { + input_h = 4; + input_w = 8; + } + + cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + cvk_tl_shape_t ofmap_shape = ifmap_shape; + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + + int data_type_size = bytesize_of_fmt(fmt); + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + + // get lut table shape and size + cvk_tl_shape_t table_shape; + uint64_t table_bytesize = cvm_lut_tbl_bytesize(bmk, &table_shape, fmt); + + cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *out = tl_ofmap_bf16; + + // atan buf + cvk_tl_t *tl_y0_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_slope_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_invert_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_pos_neg_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + 
+ // reciprocal buf + cvk_tl_t *tl_reciprocal_table_answer = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_reciprocal_table_answer_mantissa = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + + // temp buf + cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf2 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf4 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1); + + uint16_t *input_data = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize); + + // for reciprocal + uint16_t *table_reciprocal_data = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_reciprocal_data_mantissa = (uint16_t *)xmalloc(table_bytesize); + + // for atan + uint16_t *table_data_atan_y0 = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_atan_slope = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_atan_invert = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_atan_pos_neg = (uint16_t *)xmalloc(table_bytesize); + + gen_input(input_data, ifmap_size, mode, range_start, range_end); + tl_lut_ref(ref_data, input_data, ifmap_shape); + + cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape); + cvm_atan_tbl(table_data_atan_y0, table_data_atan_slope, table_data_atan_invert, + table_data_atan_pos_neg, &table_shape); + + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)input_data); + test_put_tensor_g2l_comp(ctx, bmk, tl_reciprocal_table_answer, (uint8_t *)table_reciprocal_data); + test_put_tensor_g2l_comp(ctx, bmk, tl_reciprocal_table_answer_mantissa, + (uint8_t *)table_reciprocal_data_mantissa); + + // prepare atan + test_put_tensor_g2l_comp(ctx, bmk, tl_y0_buf, (uint8_t *)table_data_atan_y0); + test_put_tensor_g2l_comp(ctx, bmk, tl_slope_buf, (uint8_t *)table_data_atan_slope); + test_put_tensor_g2l_comp(ctx, bmk, tl_invert_buf, (uint8_t *)table_data_atan_invert); + 
test_put_tensor_g2l_comp(ctx, bmk, tl_pos_neg_buf, (uint8_t *)table_data_atan_pos_neg); + + cvm_atan_emit(bmk, tl_ifmap, tl_buf, tl_buf2, tl_buf4, tl_y0_buf, tl_slope_buf, tl_invert_buf, + tl_pos_neg_buf, tl_reciprocal_table_answer, tl_reciprocal_table_answer_mantissa, + tl_ofmap_bf16, fmt); + + test_submit_comp(ctx, bmk); + + uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, out); + verify(ofmap_data, ref_data, input_data, ifmap_size, epsilon); + + free_tl(bmk, tl_buf4); + free_tl(bmk, tl_buf2); + free_tl(bmk, tl_buf); + free_tl(bmk, tl_reciprocal_table_answer_mantissa); + free_tl(bmk, tl_reciprocal_table_answer); + free_tl(bmk, tl_pos_neg_buf); + free_tl(bmk, tl_invert_buf); + free_tl(bmk, tl_slope_buf); + free_tl(bmk, tl_y0_buf); + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_ifmap); + + free(table_data_atan_y0); + free(table_data_atan_slope); + free(table_data_atan_invert); + free(table_data_atan_pos_neg); + free(table_reciprocal_data); + free(table_reciprocal_data_mantissa); + free(input_data); + free(ref_data); + free(ofmap_data); +} + +int main() { + cvk_context_t *bmk = NULL; + int round_mode; + round_mode = set_store_feround(); + + CVI_RT_HANDLE ctx; + test_init(&ctx, &bmk); + + // for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++) + // for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_ACCURACY; i++) + for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_U8; i++) + // for (int i = DATA_COMPARE_ACCURACY; i < DATA_COMPARE_U8; i++) + { + mode = static_cast(i); + printf("test mode %d...\n", mode); + testbench(&ctx, bmk); + } + printf("pass\n"); + + test_exit(&ctx, bmk); + restore_feround(round_mode); + return 0; +} diff --git a/cvimath/tests/cvi1835/atan2_degree.cpp b/cvimath/tests/cvi1835/atan2_degree.cpp new file mode 100644 index 000000000..f785c782c --- /dev/null +++ b/cvimath/tests/cvi1835/atan2_degree.cpp @@ -0,0 +1,667 @@ +/** + * \breif atan2 is implemented by atan, you can refer + * 
[wiki](https://en.wikipedia.org/wiki/Atan2) for more details + */ + +#include +#include + +#define OUT +#define IN +#include +#include +#include +#include +#include +#include +//#define DBG + +/** + * pre_data means we test fixed pattern, it should be same sa lut + */ +enum TEST_MODE { + PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare + DATA_COMPARE_ACCURACY, // generate \range_start to \range_end value that + // check epsilon, default set x > 0, y > 0 + + DATA_COMPARE_ACCURACY_X_GT_0, // atan(y/x), x > 0, y = 0 + DATA_COMPARE_ACCURACY_X_LT_0_Y_GE_0, // atan(y/x) + PI , x < 0 and y >= 0 + DATA_COMPARE_ACCURACY_X_LT_0_Y_LT_0, // atan(y/x) - PI , x < 0 and y < 0 + DATA_COMPARE_ACCURACY_X_0_Y_GT_0, // pi / 2, x = 0 and y > 0 + DATA_COMPARE_ACCURACY_X_0_Y_LT_0, // -pi / 2, x = 0 and y < 0 + DATA_COMPARE_U8, // generate \range_start to \range_end value that check + // epsilon, result bf16->uint8_t + TEST_MODE_MAX, +}; + +static TEST_MODE mode; + +static uint16_t test_pattern[] = { + 0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52, 0x3A6C, 0x3A83, 0x3A90, + 0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF, 0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17, + 0x3B1D, 0x3B24, 0x3B2A, 0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65, + 0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90, 0x3B93, 0x3B97, 0x3B9A, + 0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE, 0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1, + 0x3BC5, 0x3BC8, 0x3BCB, 0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9, + 0x3BEC, 0x3BEF, 0x3BF2, 0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03, 0x3C05, 0x3C06, 0x3C08, + 0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12, 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C, + 0x3C1D, 0x3C1F, 0x3C21, 0x3C22, 0x3C24, 0x3C25, 0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F, + 0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E, 0x3C40, 0x3C41, 0x3C43, + 0x3C45, 0x3C46, 
0x3C48, 0x3C4A, 0x3C4B, 0x3C4D, 0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57, + 0x3C58, 0x3C5A, 0x3C5C, 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A, + 0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, 0x3C7B, 0x3C7C, 0x3C7E, + 0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89, + 0x3C8A, 0x3C8A, 0x3C8B, 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93, + 0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, 0x3C9B, 0x3C9C, 0x3C9C, + 0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6, + 0x3CA7, 0x3CA8, 0x3CA9, 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0, + 0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, 0x3CB8, 0x3CB9, 0x3CBA, + 0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4, + 0x3CC5, 0x3CC5, 0x3CC6, 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE, + 0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, 0x3CD6, 0x3CD7, 0x3CD7, + 0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1, + 0x3CE2, 0x3CE3, 0x3CE4, 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB, + 0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, 0x3CF3, 0x3CF4, 0x3CF5, + 0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF, + 0x3D00, 0x3D00, 0x3D01, 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 0x3D03, 0x3D03, 0x3D04, 0x3D04, + 0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, 0x3D08, 0x3D09, 0x3D09, + 0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 0x3D0E, + 0x3D0F, 0x3D0F, 0x3D0F, 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13, + 0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 
0x3D16, 0x3D17, 0x3D17, 0x3D18, 0x3D18, + 0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D, + 0x3D1D, 0x3D1E, 0x3D1E, 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22, + 0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, 0x3D26, 0x3D26, 0x3D27, + 0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C, + 0x3D2C, 0x3D2C, 0x3D2D, 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31, + 0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 0x3D33, 0x3D33, 0x3D34, 0x3D34, 0x3D35, 0x3D35, 0x3D35, + 0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A, + 0x3D3B, 0x3D3B, 0x3D3C, 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F, + 0x3D40, 0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, 0x3D43, 0x3D44, 0x3D44, + 0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49, + 0x3D4A, 0x3D4A, 0x3D4A, 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E, + 0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, 0x3D52, 0x3D53, 0x3D53, + 0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58, + 0x3D58, 0x3D59, 0x3D59, 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D, + 0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, 0x3D61, 0x3D61, 0x3D62, + 0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67, + 0x3D67, 0x3D67, 0x3D68, 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C, + 0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, 0x3D70, 0x3D70, 0x3D70, + 0x3D71, 0x3D71, 0x3D72, 0x3D72, 0x3D72, 0x3D73, 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75, + 0x3D76, 0x3D76, 0x3D77, 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A, 
+ 0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, 0x3D7E, 0x3D7F, 0x3D7F, + 0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82, + 0x3D82, 0x3D82, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85, + 0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D87, 0x3D87, 0x3D87, + 0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89, + 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8C, 0x3D8C, + 0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, + 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91, + 0x3D91, 0x3D91, 0x3D91, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93, + 0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, 0x3D95, 0x3D96, 0x3D96, + 0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98, + 0x3D98, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B, + 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9D, 0x3D9D, 0x3D9D, + 0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0, + 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2, + 0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, + 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7, + 0x3DA7, 0x3DA7, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, + 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAC, 0x3DAC, + 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, + 0x3DAE, 0x3DAF, 0x3DAF, 0x3DAF, 
0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1, + 0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3, + 0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6, + 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8, + 0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBB, + 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD, + 0x3DBD, 0x3DBD, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, + 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC2, 0x3DC2, 0x3DC2, + 0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, + 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7, + 0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, 0x3DC9, 0x3DC9, 0x3DC9, + 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC, + 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5, + 0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, 0x3DDF, 0x3DE0, 0x3DE1, + 0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5, +}; + +static uint16_t golden_bf16[] = { + 0x42b4, 0x42b4, 0x42b4, 0x42b4, 0x42b4, 0x42b4, 0x42b3, 0x42b3, 0x42b3, 0x42b3, 0x42b3, 0x42b3, + 0x42b3, 0x42b3, 0x42b3, 0x42b3, 0x42b2, 0x42b2, 0x42b2, 0x42b2, 0x42b2, 0x42b2, 0x42b2, 0x42b2, + 0x42b2, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42b1, 0x42af, + 0x42af, 0x42af, 0x42af, 0x42af, 0x42af, 0x42af, 0x42af, 0x42af, 0x42af, 0x42ae, 0x42ae, 0x42ae, + 0x42ae, 0x42ae, 0x42ae, 0x42ae, 0x42ae, 0x42ae, 0x42ad, 0x42ad, 0x42ad, 0x42ad, 0x42ad, 0x42ad, + 0x42ad, 0x42ad, 0x42ad, 0x42ac, 0x42ac, 0x42ac, 0x42ac, 0x42ac, 0x42ac, 0x42ac, 0x42ac, 
0x42ac, + 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42ab, 0x42aa, 0x42aa, 0x42aa, + 0x42aa, 0x42aa, 0x42aa, 0x42aa, 0x42aa, 0x42a9, 0x42a9, 0x42a9, 0x42a9, 0x42a9, 0x42a9, 0x42a9, + 0x42a9, 0x42a9, 0x42a7, 0x42a7, 0x42a7, 0x42a7, 0x42a7, 0x42a7, 0x42a7, 0x42a7, 0x42a6, 0x42a6, + 0x42a6, 0x42a6, 0x42a6, 0x42a6, 0x42a6, 0x42a6, 0x42a5, 0x42a5, 0x42a5, 0x42a5, 0x42a5, 0x42a5, + 0x42a5, 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a4, 0x42a3, 0x42a3, + 0x42a3, 0x42a3, 0x42a3, 0x42a3, 0x42a3, 0x42a2, 0x42a2, 0x42a2, 0x42a2, 0x42a2, 0x42a2, 0x42a2, + 0x42a2, 0x42a2, 0x42a1, 0x42a1, 0x42a1, 0x42a1, 0x42a1, 0x42a1, 0x42a1, 0x42a0, 0x42a0, 0x42a0, + 0x42a0, 0x42a0, 0x42a0, 0x42a0, 0x42a0, 0x429e, 0x429e, 0x429e, 0x429e, 0x429e, 0x429e, 0x429d, + 0x429d, 0x429d, 0x429d, 0x429d, 0x429d, 0x429d, 0x429d, 0x429d, 0x429c, 0x429c, 0x429c, 0x429c, + 0x429c, 0x429b, 0x429b, 0x429b, 0x429b, 0x429b, 0x429b, 0x429b, 0x429b, 0x429a, 0x429a, 0x429a, + 0x429a, 0x429a, 0x429a, 0x4299, 0x4299, 0x4299, 0x4299, 0x4299, 0x4299, 0x4299, 0x4299, 0x4298, + 0x4298, 0x4298, 0x4298, 0x4298, 0x4298, 0x4297, 0x4297, 0x4297, 0x4297, 0x4297, 0x4297, 0x4296, + 0x4296, 0x4296, 0x4296, 0x4296, 0x4295, 0x4295, 0x4295, 0x4295, 0x4295, 0x4295, 0x4295, 0x4295, + 0x4294, 0x4294, 0x4294, 0x4294, 0x4294, 0x4294, 0x4293, 0x4293, 0x4293, 0x4293, 0x4293, 0x4293, + 0x4292, 0x4292, 0x4292, 0x4292, 0x4292, 0x4291, 0x4291, 0x4291, 0x4291, 0x4291, 0x4291, 0x4291, + 0x4291, 0x428f, 0x428f, 0x428f, 0x428e, 0x428e, 0x428e, 0x428e, 0x428e, 0x428e, 0x428e, 0x428e, + 0x428d, 0x428d, 0x428d, 0x428d, 0x428c, 0x428c, 0x428c, 0x428c, 0x428c, 0x428c, 0x428c, 0x428c, + 0x428b, 0x428b, 0x428b, 0x428a, 0x428a, 0x428a, 0x428a, 0x428a, 0x428a, 0x428a, 0x4289, 0x4289, + 0x4289, 0x4288, 0x4288, 0x4288, 0x4288, 0x4288, 0x4288, 0x4287, 0x4287, 0x4287, 0x4287, 0x4287, + 0x4286, 0x4286, 0x4286, 0x4286, 0x4286, 0x4286, 0x4286, 0x4286, 0x4285, 0x4285, 0x4285, 0x4285, + 0x4285, 0x4285, 0x4285, 
0x4285, 0x4285, 0x4284, 0x4284, 0x4284, 0x4284, 0x4284, 0x4283, 0x4283, + 0x4282, 0x4282, 0x4282, 0x4282, 0x4282, 0x4281, 0x4281, 0x4281, 0x4281, 0x4281, 0x4281, 0x4281, + 0x4280, 0x4280, 0x4280, 0x427e, 0x427e, 0x427e, 0x427e, 0x427e, 0x427c, 0x427c, 0x427c, 0x427a, + 0x427a, 0x427a, 0x427a, 0x427a, 0x427a, 0x4278, 0x4278, 0x4278, 0x4277, 0x4277, 0x4277, 0x4277, + 0x4277, 0x4277, 0x4275, 0x4275, 0x4275, 0x4273, 0x4273, 0x4273, 0x4273, 0x4273, 0x4271, 0x4271, + 0x4271, 0x4270, 0x4270, 0x4270, 0x4270, 0x4270, 0x4270, 0x4270, 0x426e, 0x426c, 0x426c, 0x426c, + 0x426c, 0x426c, 0x426a, 0x426a, 0x426a, 0x426a, 0x4269, 0x4269, 0x4269, 0x4269, 0x4269, 0x4267, + 0x4267, 0x4266, 0x4266, 0x4266, 0x4266, 0x4266, 0x4264, 0x4264, 0x4264, 0x4262, 0x4262, 0x4262, + 0x4262, 0x4261, 0x4261, 0x4261, 0x425f, 0x425f, 0x425f, 0x425f, 0x425f, 0x425e, 0x425e, 0x425c, + 0x425c, 0x425c, 0x425c, 0x425c, 0x425b, 0x425b, 0x425b, 0x4259, 0x4259, 0x4259, 0x4259, 0x4257, + 0x4257, 0x4257, 0x4256, 0x4256, 0x4256, 0x4256, 0x4256, 0x4253, 0x4253, 0x4253, 0x4253, 0x4253, + 0x4253, 0x4253, 0x4250, 0x4250, 0x4250, 0x4250, 0x4250, 0x424f, 0x424f, 0x424d, 0x424d, 0x424d, + 0x424d, 0x424d, 0x424b, 0x424b, 0x424b, 0x424b, 0x424b, 0x4249, 0x4249, 0x4249, 0x4248, 0x4248, + 0x4248, 0x4248, 0x4247, 0x4247, 0x4247, 0x4245, 0x4245, 0x4244, 0x4244, 0x4244, 0x4243, 0x4243, + 0x4241, 0x4241, 0x4241, 0x4240, 0x4240, 0x4240, 0x4240, 0x4240, 0x423e, 0x423e, 0x423e, 0x423e, + 0x423b, 0x423b, 0x423b, 0x423b, 0x423b, 0x423a, 0x423a, 0x423a, 0x4239, 0x4239, 0x4237, 0x4237, + 0x4237, 0x4236, 0x4236, 0x4236, 0x4236, 0x4236, 0x4235, 0x4235, 0x4234, 0x4234, 0x4232, 0x4232, + 0x4232, 0x4232, 0x4232, 0x4231, 0x4231, 0x4231, 0x422f, 0x422f, 0x422d, 0x422d, 0x422d, 0x422d, + 0x422d, 0x422c, 0x422c, 0x422c, 0x422a, 0x422a, 0x422a, 0x422a, 0x4228, 0x4228, 0x4228, 0x4228, + 0x4228, 0x4227, 0x4227, 0x4227, 0x4225, 0x4225, 0x4223, 0x4223, 0x4223, 0x4223, 0x4223, 0x4223, + 0x4223, 0x4221, 0x4220, 0x4220, 0x4220, 0x4220, 0x421f, 0x421f, 
0x421f, 0x421d, 0x421d, 0x421d, + 0x421d, 0x421d, 0x421b, 0x421b, 0x421b, 0x421b, 0x421b, 0x4219, 0x4219, 0x4218, 0x4218, 0x4218, + 0x4218, 0x4218, 0x4215, 0x4215, 0x4215, 0x4215, 0x4215, 0x4215, 0x4215, 0x4213, 0x4213, 0x4213, + 0x4212, 0x4212, 0x4211, 0x4211, 0x4211, 0x420f, 0x420f, 0x420f, 0x420f, 0x420d, 0x420d, 0x420d, + 0x420c, 0x420c, 0x420c, 0x420c, 0x420c, 0x420a, 0x420a, 0x4209, 0x4209, 0x4209, 0x4209, 0x4209, + 0x4207, 0x4207, 0x4207, 0x4206, 0x4206, 0x4206, 0x4206, 0x4204, 0x4204, 0x4204, 0x4202, 0x4202, + 0x4202, 0x4202, 0x4202, 0x4201, 0x4201, 0x41fe, 0x41fe, 0x41fe, 0x41fe, 0x41fe, 0x41fb, 0x41fb, + 0x41fb, 0x41fb, 0x41f8, 0x41f8, 0x41f8, 0x41f8, 0x41f8, 0x41f4, 0x41f1, 0x41f1, 0x41f1, 0x41f1, + 0x41f1, 0x41f1, 0x41f1, 0x41ed, 0x41ed, 0x41ed, 0x41ed, 0x41ed, 0x41ea, 0x41ea, 0x41ea, 0x41e6, + 0x41e6, 0x41e6, 0x41e3, 0x41e3, 0x41e3, 0x41e3, 0x41e3, 0x41df, 0x41df, 0x41df, 0x41df, 0x41dc, + 0x41dc, 0x41dc, 0x41dc, 0x41dc, 0x41d8, 0x41d8, 0x41d8, 0x41d8, 0x41d5, 0x41d5, 0x41d5, 0x41d5, + 0x41d5, 0x41d1, 0x41d1, 0x41d1, 0x41cd, 0x41cd, 0x41cd, 0x41cd, 0x41cd, 0x41cd, 0x41cd, 0x41c9, + 0x41c9, 0x41c9, 0x41c6, 0x41c6, 0x41c6, 0x41c6, 0x41c6, 0x41c6, 0x41c6, 0x41c2, 0x41c2, 0x41be, + 0x41be, 0x41be, 0x41be, 0x41be, 0x41be, 0x41ba, 0x41ba, 0x41ba, 0x41ba, 0x41ba, 0x41b6, 0x41b6, + 0x41b6, 0x41b6, 0x41b6, 0x41b6, 0x41b6, 0x41b2, 0x41b2, 0x41ae, 0x41ae, 0x41ae, 0x41ae, 0x41ae, + 0x41ae, 0x41ae, 0x41ae, 0x41aa, 0x41aa, 0x41aa, 0x41aa, 0x41aa, 0x41a6, 0x41a6, 0x41a6, 0x41a6, + 0x41a6, 0x41a2, 0x41a2, 0x41a2, 0x41a2, 0x419e, 0x419e, 0x419e, 0x419e, 0x419e, 0x419e, 0x419a, + 0x419a, 0x419a, 0x419a, 0x419a, 0x4196, 0x4196, 0x4196, 0x4196, 0x4196, 0x4196, 0x4196, 0x4196, + 0x4196, 0x4192, 0x418e, 0x418e, 0x418e, 0x418e, 0x418e, 0x418e, 0x418e, 0x418e, 0x418e, 0x418a, + 0x418a, 0x418a, 0x418a, 0x418a, 0x4186, 0x4186, 0x4186, 0x4186, 0x4186, 0x4186, 0x4186, 0x4181, + 0x4181, 0x4181, 0x4181, 0x4181, 0x417a, 0x417a, 0x417a, 0x417a, 0x417a, 0x417a, 0x417a, 0x417a, + 
0x4172, 0x4172, 0x4172, 0x4172, 0x4172, 0x4169, 0x4169, 0x4169, 0x4169, 0x4169, 0x4169, 0x4161, + 0x4161, 0x4161, 0x4161, 0x4161, 0x4161, 0x4158, 0x4158, 0x4158, 0x4158, 0x4158, 0x4158, 0x4158, + 0x4158, 0x4158, 0x414f, 0x414f, 0x414f, 0x414f, 0x414f, 0x4147, 0x4147, 0x4147, 0x4147, 0x4147, + 0x4147, 0x4147, 0x4147, 0x413e, 0x413e, 0x413e, 0x413e, 0x413e, 0x4135, 0x4135, 0x4135, 0x4135, + 0x4135, 0x4135, 0x4135, 0x4135, 0x4135, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x4123, + 0x4123, 0x4123, 0x4123, 0x4123, 0x4123, 0x4123, 0x4123, 0x4123, 0x411a, 0x411a, 0x411a, 0x411a, + 0x411a, 0x411a, 0x4111, 0x4111, 0x4111, 0x4111, 0x4111, 0x4111, 0x4111, 0x4111, 0x4108, 0x4108, + 0x4108, 0x4108, 0x4108, 0x4108, 0x4108, 0x4108, 0x40ff, 0x40ff, 0x40ff, 0x40ff, 0x40ff, 0x40ff, + 0x40ff, 0x40ff, 0x40ed, 0x40ed, 0x40ed, 0x40ed, 0x40ed, 0x40ed, 0x40ed, 0x40ed, 0x40db, 0x40db, + 0x40db, 0x40db, 0x40db, 0x40db, 0x40db, 0x40db, 0x40c9, 0x40c9, 0x40c9, 0x40c9, 0x40c9, 0x40c9, + 0x40c9, 0x40c9, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, 0x40b7, + 0x40a5, 0x40a5, 0x40a5, 0x40a5, 0x40a5, 0x40a5, 0x40a5, 0x4092, 0x4092, 0x4092, 0x4092, 0x4092, + 0x4092, 0x4092, 0x4092, 0x4092, 0x4080, 0x4080, 0x4080, 0x4080, 0x4080, 0x4080, 0x4080, 0x4080, + 0x4080, 0x405c, 0x405c, 0x405c, 0x405c, 0x405c, 0x405c, 0x405c, 0x405c, 0x405c, 0x4037, 0x4037, + 0x4037, 0x4037, 0x4037, 0x4037, 0x4037, 0x4037, 0x4037, 0x4013, 0x4013, 0x4013, 0x4013, 0x4013, + 0x4013, 0x4013, 0x4013, 0x4013, 0x4013, 0x3fdc, 0x3fdc, 0x3fdc, 0x3fdc, 0x3fdc, 0x3fdc, 0x3fdc, + 0x3fdc, 0x3fdc, 0x3fdc, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, + 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x3f13, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, +}; + +// dist(range_start, range_end); + + float LO = pow(2, range_start); + float HI = pow(2, range_end); + for (uint64_t i = 0; i < ifmap_size; i++) { + // input range is -8 ~ +8 + int table_hw = 256; + float input = + 
((int)i % (range_end - 2)) * (((int)i % 2) ? 1 : -1) + 0.03 + (i % table_hw) * 0.002; + input = ((int)i % (range_end - 2)) * (((int)i % 2) ? 1 : 1) + 0.03 + (i % table_hw) * 0.002; + input_data[i] = convert_fp32_bf16(input); + input = dist(e2); + input = LO + static_cast(rand()) / (static_cast(RAND_MAX / (HI - LO))); + } +} + +static void gen_input(uint16_t *x, uint16_t *y, uint64_t ifmap_size, TEST_MODE mode, + int range_start, int range_end) { + if (mode == PRE_DATA_COMPARE_FIX) { + memcpy(x, &test_pattern, sizeof(test_pattern)); + } else { + range_start = abs(range_start); + range_end = abs(range_end); + _gen_input(x, ifmap_size, range_start, range_end); + } + + // invert for test + for (uint64_t i = 0; i < ifmap_size; i++) { + y[i] = x[(ifmap_size - 1) - i]; + } + + if (mode == DATA_COMPARE_ACCURACY_X_GT_0) { + // y = any + uint32_t i = 0; + for (; i < ifmap_size / 4; i++) { + // y < 0 + y[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(y[i])); + y[i + ifmap_size / 4] = convert_fp32_bf16(0); + } + } else if (mode == DATA_COMPARE_ACCURACY_X_LT_0_Y_GE_0) { + // x < 0 and y >= 0 + for (uint32_t i = 0; i < ifmap_size; i++) { + x[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(x[i])); + } + + for (uint32_t i = 0; i < ifmap_size / 4; i++) { + y[i + ifmap_size / 4] = convert_fp32_bf16(0); + } + } else if (mode == DATA_COMPARE_ACCURACY_X_LT_0_Y_LT_0) { + // x < 0 and y < 0 + for (uint32_t i = 0; i < ifmap_size; i++) { + x[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(x[i])); + y[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(y[i])); + } + } else if (mode == DATA_COMPARE_ACCURACY_X_0_Y_GT_0) { + // pi / 2, x = 0 and y > 0 + for (uint32_t i = 0; i < ifmap_size; i++) { + x[i] = convert_fp32_bf16(0); + } + } else if (mode == DATA_COMPARE_ACCURACY_X_0_Y_LT_0) { + // -pi / 2, x = 0 and y < 0 + for (uint32_t i = 0; i < ifmap_size; i++) { + x[i] = convert_fp32_bf16(0); + y[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(y[i])); + } + } + + if (mode != PRE_DATA_COMPARE_FIX) { + int i 
= 0; + x[i] = convert_fp32_bf16(-10.0); + y[i++] = convert_fp32_bf16(6.0); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(19.000000); + y[i] = convert_fp32_bf16(5.000000); + x[i++] = convert_fp32_bf16(-125.000000); + y[i] = convert_fp32_bf16(1.070312); + x[i++] = convert_fp32_bf16(0.498046); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + x[i] = convert_fp32_bf16(424.000); + y[i++] = convert_fp32_bf16(-1.00); + x[i] = convert_fp32_bf16(2.484375); + y[i++] = convert_fp32_bf16(-7.531250); + x[i] = convert_fp32_bf16(-2.484375); + y[i++] = convert_fp32_bf16(-7.531250); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(7.531250); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(-7.531250); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(0); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(0.394531); + y[i] = convert_fp32_bf16(-4.000000); + x[i++] = convert_fp32_bf16(-64.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-40.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-53.000000); + y[i] = convert_fp32_bf16(-9.000000); + x[i++] = convert_fp32_bf16(-91.000000); + y[i] = convert_fp32_bf16(12.000000); + x[i++] = convert_fp32_bf16(-164.000000); + y[i] = convert_fp32_bf16(-20.000000); + x[i++] = convert_fp32_bf16(-320.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-71.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-155.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-247.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-118.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-54.000000); + y[i] = convert_fp32_bf16(-5.000000); + x[i++] = convert_fp32_bf16(-392.000000); + y[i] = convert_fp32_bf16(-37.000000); + 
x[i++] = convert_fp32_bf16(-520.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-19.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-10.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-21.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-17.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-17.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-10.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-69.000000); + y[i] = convert_fp32_bf16(4.000000); + x[i++] = convert_fp32_bf16(-86.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-34.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(1.000000); + 
x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-136.000000); + y[i] = convert_fp32_bf16(-3.000000); + x[i++] = convert_fp32_bf16(-79.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-38.000000); + y[i] = convert_fp32_bf16(5.000000); + x[i++] = convert_fp32_bf16(-173.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-78.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-60.000000); + y[i] = convert_fp32_bf16(3.000000); + x[i++] = convert_fp32_bf16(-123.000000); + y[i] = convert_fp32_bf16(-9.000000); + x[i++] = convert_fp32_bf16(-280.000000); + y[i] = convert_fp32_bf16(3.000000); + x[i++] = convert_fp32_bf16(-39.000000); + y[i] = convert_fp32_bf16(2.000000); + x[i++] = convert_fp32_bf16(-524.000000); + y[i] = convert_fp32_bf16(11.000000); + x[i++] = convert_fp32_bf16(-376.000000); + y[i] = convert_fp32_bf16(5.000000); + x[i++] = convert_fp32_bf16(-131.000000); + y[i] = convert_fp32_bf16(11.000000); + x[i++] = convert_fp32_bf16(-324.000000); + y[i] = convert_fp32_bf16(9.000000); + x[i++] = convert_fp32_bf16(-125.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-92.000000); + y[i] = convert_fp32_bf16(-7.000000); + x[i++] = convert_fp32_bf16(-233.000000); + y[i] = convert_fp32_bf16(10.000000); + x[i++] = convert_fp32_bf16(-170.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-10.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-23.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(-3.000000); + x[i++] = convert_fp32_bf16(-37.000000); + + y[i] = 
convert_fp32_bf16(-9); + x[i++] = convert_fp32_bf16(-1); + + y[i] = convert_fp32_bf16(7.0); + x[i++] = convert_fp32_bf16(-1); + + y[i] = convert_fp32_bf16(0); + x[i++] = convert_fp32_bf16(-1); + } + +#ifdef DBG + for (uint64_t i = 0; i < ifmap_size; i++) { + printf("source[%ld] y %f x %f\n", i, convert_bf16_fp32(y[i]), convert_bf16_fp32(x[i])); + } +#endif /* ifdef DBG */ +} + +static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) { + // TODO: check more shape / align + cvk_chip_info_t chip_info = bmk->info; + + uint32_t input_n = 1; + uint32_t input_c = chip_info.npu_num; + uint32_t input_h = 16; + uint32_t input_w = 16; + float epsilon = 0.2; + int range_start = -8; + int range_end = 8; + + if (mode == PRE_DATA_COMPARE_FIX) { + input_h = 4; + input_w = 8; + } + + cvk_fmt_t fmt = CVK_FMT_BF16; + + cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + cvk_tl_shape_t ofmap_shape = ifmap_shape; + + // get lut table shape and size + cvk_tl_shape_t table_shape; + uint64_t table_bytesize = cvm_lut_tbl_bytesize(bmk, &table_shape, fmt); + + // get input / output size + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + int data_type_size = bytesize_of_fmt(fmt); + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + + // atan2 was two inputs + cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ifmap2 = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *out = tl_ofmap_bf16; + + // atan buf + cvk_tl_t *tl_y0_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_invert_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_pos_neg_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + + // reciprocal buf + cvk_tl_t *tl_reciprocal_table_answer = test_alloc_tl(bmk, table_shape, 
fmt, /*align*/ 1); + cvk_tl_t *tl_reciprocal_table_answer_mantissa = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + + // temp buf + cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf2 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf3 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1); + + uint16_t *input_data = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *input_data2 = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize); + + // for reciprocal + uint16_t *table_reciprocal_data = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_reciprocal_data_mantissa = (uint16_t *)xmalloc(table_bytesize); + + // for atan + uint16_t *table_data_atan_y0 = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_atan_invert = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_atan_pos_neg = (uint16_t *)xmalloc(table_bytesize); + + // for search '0' index + uint16_t *idx_0_table_data = (uint16_t *)xmalloc(table_bytesize); + + // init input / ref + // input_data is x, input_data2 is y + gen_input(input_data, input_data2, ifmap_size, mode, range_start, range_end); + tl_lut_ref(ref_data, input_data, input_data2, ifmap_shape); + + // init lut table + cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape); + cvm_atan_fast_degree_tbl(table_data_atan_y0, table_data_atan_invert, table_data_atan_pos_neg, + &table_shape); + cvm_gen_0_tbl(idx_0_table_data, &table_shape); + + // sys->local + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)input_data); + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap2, (uint8_t *)input_data2); + test_put_tensor_g2l_comp(ctx, bmk, tl_reciprocal_table_answer, (uint8_t *)table_reciprocal_data); + test_put_tensor_g2l_comp(ctx, bmk, tl_reciprocal_table_answer_mantissa, + (uint8_t *)table_reciprocal_data_mantissa); + + test_put_tensor_g2l_comp(ctx, bmk, tl_y0_buf, (uint8_t 
*)table_data_atan_y0); + test_put_tensor_g2l_comp(ctx, bmk, tl_invert_buf, (uint8_t *)table_data_atan_invert); + test_put_tensor_g2l_comp(ctx, bmk, tl_pos_neg_buf, (uint8_t *)table_data_atan_pos_neg); + + cvm_atan2_fast_degree_emit(bmk, tl_ifmap2, tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, + tl_invert_buf, tl_pos_neg_buf, tl_reciprocal_table_answer, + tl_reciprocal_table_answer_mantissa, OUT tl_ofmap_bf16, fmt); + + uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, out); + verify(ofmap_data, ref_data, input_data, input_data2, ifmap_size, epsilon); + + free_tl(bmk, tl_buf3); + free_tl(bmk, tl_buf2); + free_tl(bmk, tl_buf); + free_tl(bmk, tl_reciprocal_table_answer_mantissa); + free_tl(bmk, tl_reciprocal_table_answer); + free_tl(bmk, tl_pos_neg_buf); + free_tl(bmk, tl_invert_buf); + free_tl(bmk, tl_y0_buf); + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_ifmap2); + free_tl(bmk, tl_ifmap); + + free(table_data_atan_y0); + free(idx_0_table_data); + free(table_data_atan_invert); + free(table_data_atan_pos_neg); + free(table_reciprocal_data); + free(table_reciprocal_data_mantissa); + free(input_data); + free(ref_data); + free(ofmap_data); + free(input_data2); +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bmk; + int round_mode; + + round_mode = set_store_feround(); + + test_init(&ctx, &bmk); + + // for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++) + // for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_ACCURACY; i++) { + // for (int i = DATA_COMPARE_ACCURACY; i < DATA_COMPARE_U8; i++) { + // for (int i = DATA_COMPARE_ACCURACY; i < DATA_COMPARE_ACCURACY_X_GT_0; i++) + // { + for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_U8; i++) { + mode = static_cast(i); + printf("test mode %d...\n", mode); + testbench(&ctx, bmk); + } + printf("pass\n"); + + test_exit(&ctx, bmk); + restore_feround(round_mode); + return 0; +} diff --git a/cvimath/tests/cvi1835/atan2_radian.cpp b/cvimath/tests/cvi1835/atan2_radian.cpp new file mode 100644 
index 000000000..3d2189983 --- /dev/null +++ b/cvimath/tests/cvi1835/atan2_radian.cpp @@ -0,0 +1,719 @@ +/** + * \breif atan2 is implemented by atan, you can refer + * [wiki](https://en.wikipedia.org/wiki/Atan2) for more details + */ + +#include +#include + +#define OUT +#define IN +#include +#include +#include +#include +#include +#include +//#define DBG + +/** + * pre_data means we test fixed pattern, it should be same sa lut + */ +enum TEST_MODE { + PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare + DATA_COMPARE_ACCURACY, // generate \range_start to \range_end value that + // check epsilon, default set x > 0, y > 0 + + DATA_COMPARE_ACCURACY_X_GT_0, // atan(y/x), x > 0, y = 0 + DATA_COMPARE_ACCURACY_X_LT_0_Y_GE_0, // atan(y/x) + PI , x < 0 and y >= 0 + DATA_COMPARE_ACCURACY_X_LT_0_Y_LT_0, // atan(y/x) - PI , x < 0 and y < 0 + DATA_COMPARE_ACCURACY_X_0_Y_GT_0, // pi / 2, x = 0 and y > 0 + DATA_COMPARE_ACCURACY_X_0_Y_LT_0, // -pi / 2, x = 0 and y < 0 + DATA_COMPARE_U8, // generate \range_start to \range_end value that check + // epsilon, result bf16->uint8_t + TEST_MODE_MAX, +}; + +static TEST_MODE mode; + +static uint16_t test_pattern[] = { + 0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52, 0x3A6C, 0x3A83, 0x3A90, + 0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF, 0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17, + 0x3B1D, 0x3B24, 0x3B2A, 0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65, + 0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90, 0x3B93, 0x3B97, 0x3B9A, + 0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE, 0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1, + 0x3BC5, 0x3BC8, 0x3BCB, 0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9, + 0x3BEC, 0x3BEF, 0x3BF2, 0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03, 0x3C05, 0x3C06, 0x3C08, + 0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12, 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C, + 0x3C1D, 0x3C1F, 0x3C21, 0x3C22, 0x3C24, 0x3C25, 
0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F, + 0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E, 0x3C40, 0x3C41, 0x3C43, + 0x3C45, 0x3C46, 0x3C48, 0x3C4A, 0x3C4B, 0x3C4D, 0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57, + 0x3C58, 0x3C5A, 0x3C5C, 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A, + 0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, 0x3C7B, 0x3C7C, 0x3C7E, + 0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89, + 0x3C8A, 0x3C8A, 0x3C8B, 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93, + 0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, 0x3C9B, 0x3C9C, 0x3C9C, + 0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6, + 0x3CA7, 0x3CA8, 0x3CA9, 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0, + 0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, 0x3CB8, 0x3CB9, 0x3CBA, + 0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4, + 0x3CC5, 0x3CC5, 0x3CC6, 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE, + 0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, 0x3CD6, 0x3CD7, 0x3CD7, + 0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1, + 0x3CE2, 0x3CE3, 0x3CE4, 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB, + 0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, 0x3CF3, 0x3CF4, 0x3CF5, + 0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF, + 0x3D00, 0x3D00, 0x3D01, 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 0x3D03, 0x3D03, 0x3D04, 0x3D04, + 0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, 0x3D08, 0x3D09, 0x3D09, + 0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 
0x3D0E, + 0x3D0F, 0x3D0F, 0x3D0F, 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13, + 0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 0x3D16, 0x3D17, 0x3D17, 0x3D18, 0x3D18, + 0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D, + 0x3D1D, 0x3D1E, 0x3D1E, 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22, + 0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, 0x3D26, 0x3D26, 0x3D27, + 0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C, + 0x3D2C, 0x3D2C, 0x3D2D, 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31, + 0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 0x3D33, 0x3D33, 0x3D34, 0x3D34, 0x3D35, 0x3D35, 0x3D35, + 0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A, + 0x3D3B, 0x3D3B, 0x3D3C, 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F, + 0x3D40, 0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, 0x3D43, 0x3D44, 0x3D44, + 0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49, + 0x3D4A, 0x3D4A, 0x3D4A, 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E, + 0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, 0x3D52, 0x3D53, 0x3D53, + 0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58, + 0x3D58, 0x3D59, 0x3D59, 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D, + 0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, 0x3D61, 0x3D61, 0x3D62, + 0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67, + 0x3D67, 0x3D67, 0x3D68, 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C, + 0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, 0x3D70, 0x3D70, 0x3D70, + 0x3D71, 0x3D71, 0x3D72, 
0x3D72, 0x3D72, 0x3D73, 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75, + 0x3D76, 0x3D76, 0x3D77, 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A, + 0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, 0x3D7E, 0x3D7F, 0x3D7F, + 0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82, + 0x3D82, 0x3D82, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85, + 0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D87, 0x3D87, 0x3D87, + 0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89, + 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8C, 0x3D8C, + 0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, + 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91, + 0x3D91, 0x3D91, 0x3D91, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93, + 0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, 0x3D95, 0x3D96, 0x3D96, + 0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98, + 0x3D98, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B, + 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9D, 0x3D9D, 0x3D9D, + 0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0, + 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2, + 0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, + 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7, + 0x3DA7, 0x3DA7, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, + 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 
0x3DAB, 0x3DAB, 0x3DAC, 0x3DAC, + 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, + 0x3DAE, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1, + 0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3, + 0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6, + 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8, + 0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBB, + 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD, + 0x3DBD, 0x3DBD, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, + 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC2, 0x3DC2, 0x3DC2, + 0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, + 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7, + 0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, 0x3DC9, 0x3DC9, 0x3DC9, + 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC, + 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5, + 0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, 0x3DDF, 0x3DE0, 0x3DE1, + 0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5, +}; + +static uint16_t golden_bf16[] = { + 0x3fc9, 0x3fc9, 0x3fc9, 0x3fc9, 0x3fc9, 0x3fc9, 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc8, + 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc8, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, 0x3fc7, + 0x3fc7, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc5, 0x3fc4, + 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc4, 0x3fc3, 0x3fc3, 0x3fc3, + 0x3fc3, 0x3fc3, 0x3fc3, 
0x3fc3, 0x3fc3, 0x3fc3, 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc1, + 0x3fc1, 0x3fc1, 0x3fc1, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, 0x3fc0, + 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbf, 0x3fbe, 0x3fbe, 0x3fbe, + 0x3fbe, 0x3fbe, 0x3fbe, 0x3fbe, 0x3fbe, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, 0x3fbc, + 0x3fbc, 0x3fbc, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fbb, 0x3fba, 0x3fba, + 0x3fba, 0x3fba, 0x3fba, 0x3fba, 0x3fba, 0x3fba, 0x3fb9, 0x3fb9, 0x3fb9, 0x3fb9, 0x3fb9, 0x3fb9, + 0x3fb9, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb7, 0x3fb6, 0x3fb6, + 0x3fb6, 0x3fb6, 0x3fb6, 0x3fb6, 0x3fb6, 0x3fb5, 0x3fb5, 0x3fb5, 0x3fb5, 0x3fb5, 0x3fb5, 0x3fb5, + 0x3fb5, 0x3fb5, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb4, 0x3fb2, 0x3fb2, 0x3fb2, + 0x3fb2, 0x3fb2, 0x3fb2, 0x3fb2, 0x3fb2, 0x3fb1, 0x3fb1, 0x3fb1, 0x3fb1, 0x3fb1, 0x3fb1, 0x3fb0, + 0x3fb0, 0x3fb0, 0x3fb0, 0x3fb0, 0x3fb0, 0x3fb0, 0x3fb0, 0x3fb0, 0x3faf, 0x3faf, 0x3faf, 0x3faf, + 0x3faf, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fad, 0x3fac, 0x3fac, 0x3fac, + 0x3fac, 0x3fac, 0x3fac, 0x3fab, 0x3fab, 0x3fab, 0x3fab, 0x3fab, 0x3fab, 0x3fab, 0x3fab, 0x3faa, + 0x3faa, 0x3faa, 0x3faa, 0x3faa, 0x3faa, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa9, 0x3fa7, + 0x3fa7, 0x3fa7, 0x3fa7, 0x3fa7, 0x3fa6, 0x3fa6, 0x3fa6, 0x3fa6, 0x3fa6, 0x3fa6, 0x3fa6, 0x3fa6, + 0x3fa5, 0x3fa5, 0x3fa5, 0x3fa5, 0x3fa5, 0x3fa5, 0x3fa4, 0x3fa4, 0x3fa4, 0x3fa4, 0x3fa4, 0x3fa4, + 0x3fa3, 0x3fa3, 0x3fa3, 0x3fa3, 0x3fa3, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa1, 0x3fa1, + 0x3fa1, 0x3fa0, 0x3fa0, 0x3fa0, 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9f, 0x3f9f, + 0x3f9e, 0x3f9e, 0x3f9e, 0x3f9e, 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9d, 0x3f9d, + 0x3f9c, 0x3f9c, 0x3f9c, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f9b, 0x3f99, 0x3f99, + 0x3f99, 0x3f98, 0x3f98, 0x3f98, 0x3f98, 0x3f98, 0x3f98, 0x3f97, 
0x3f97, 0x3f97, 0x3f97, 0x3f97, + 0x3f96, 0x3f96, 0x3f96, 0x3f96, 0x3f96, 0x3f96, 0x3f96, 0x3f96, 0x3f95, 0x3f95, 0x3f94, 0x3f94, + 0x3f94, 0x3f94, 0x3f94, 0x3f94, 0x3f94, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f93, 0x3f92, 0x3f92, + 0x3f91, 0x3f91, 0x3f91, 0x3f91, 0x3f91, 0x3f90, 0x3f90, 0x3f90, 0x3f90, 0x3f90, 0x3f90, 0x3f90, + 0x3f8f, 0x3f8f, 0x3f8f, 0x3f8e, 0x3f8e, 0x3f8e, 0x3f8e, 0x3f8e, 0x3f8d, 0x3f8d, 0x3f8d, 0x3f8c, + 0x3f8c, 0x3f8c, 0x3f8c, 0x3f8c, 0x3f8c, 0x3f8b, 0x3f8b, 0x3f8b, 0x3f8a, 0x3f8a, 0x3f8a, 0x3f8a, + 0x3f8a, 0x3f8a, 0x3f89, 0x3f89, 0x3f89, 0x3f88, 0x3f88, 0x3f88, 0x3f88, 0x3f88, 0x3f87, 0x3f87, + 0x3f87, 0x3f86, 0x3f86, 0x3f86, 0x3f86, 0x3f86, 0x3f86, 0x3f86, 0x3f85, 0x3f84, 0x3f84, 0x3f84, + 0x3f84, 0x3f84, 0x3f83, 0x3f83, 0x3f83, 0x3f83, 0x3f82, 0x3f82, 0x3f82, 0x3f82, 0x3f82, 0x3f81, + 0x3f81, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7d, 0x3f7d, 0x3f7d, + 0x3f7d, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f78, 0x3f78, 0x3f76, + 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f74, 0x3f74, 0x3f74, 0x3f72, 0x3f72, 0x3f72, 0x3f72, 0x3f71, + 0x3f71, 0x3f71, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6c, 0x3f6c, 0x3f6c, 0x3f6c, 0x3f6c, + 0x3f6c, 0x3f6c, 0x3f69, 0x3f69, 0x3f69, 0x3f69, 0x3f69, 0x3f67, 0x3f67, 0x3f65, 0x3f65, 0x3f65, + 0x3f65, 0x3f65, 0x3f62, 0x3f62, 0x3f62, 0x3f62, 0x3f62, 0x3f61, 0x3f61, 0x3f61, 0x3f5f, 0x3f5f, + 0x3f5f, 0x3f5f, 0x3f5e, 0x3f5e, 0x3f5e, 0x3f5c, 0x3f5c, 0x3f5b, 0x3f5b, 0x3f5b, 0x3f59, 0x3f59, + 0x3f58, 0x3f58, 0x3f58, 0x3f57, 0x3f57, 0x3f57, 0x3f57, 0x3f57, 0x3f54, 0x3f54, 0x3f54, 0x3f54, + 0x3f51, 0x3f51, 0x3f51, 0x3f51, 0x3f51, 0x3f50, 0x3f50, 0x3f50, 0x3f4e, 0x3f4e, 0x3f4d, 0x3f4d, + 0x3f4d, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4a, 0x3f4a, 0x3f49, 0x3f49, 0x3f46, 0x3f46, + 0x3f46, 0x3f46, 0x3f46, 0x3f45, 0x3f45, 0x3f45, 0x3f44, 0x3f44, 0x3f41, 0x3f41, 0x3f41, 0x3f41, + 0x3f41, 0x3f40, 0x3f40, 0x3f40, 0x3f3e, 0x3f3e, 0x3f3e, 0x3f3e, 0x3f3c, 0x3f3c, 0x3f3c, 0x3f3c, + 
0x3f3c, 0x3f3a, 0x3f3a, 0x3f3a, 0x3f39, 0x3f39, 0x3f36, 0x3f36, 0x3f36, 0x3f36, 0x3f36, 0x3f36, + 0x3f36, 0x3f34, 0x3f33, 0x3f33, 0x3f33, 0x3f33, 0x3f31, 0x3f31, 0x3f31, 0x3f30, 0x3f30, 0x3f30, + 0x3f30, 0x3f30, 0x3f2d, 0x3f2d, 0x3f2d, 0x3f2d, 0x3f2d, 0x3f2b, 0x3f2b, 0x3f2a, 0x3f2a, 0x3f2a, + 0x3f2a, 0x3f2a, 0x3f26, 0x3f26, 0x3f26, 0x3f26, 0x3f26, 0x3f26, 0x3f26, 0x3f25, 0x3f25, 0x3f25, + 0x3f23, 0x3f23, 0x3f21, 0x3f21, 0x3f21, 0x3f20, 0x3f20, 0x3f20, 0x3f20, 0x3f1e, 0x3f1e, 0x3f1e, + 0x3f1c, 0x3f1c, 0x3f1c, 0x3f1c, 0x3f1c, 0x3f1b, 0x3f1b, 0x3f19, 0x3f19, 0x3f19, 0x3f19, 0x3f19, + 0x3f17, 0x3f17, 0x3f17, 0x3f15, 0x3f15, 0x3f15, 0x3f15, 0x3f14, 0x3f14, 0x3f14, 0x3f12, 0x3f12, + 0x3f12, 0x3f12, 0x3f12, 0x3f10, 0x3f10, 0x3f0e, 0x3f0e, 0x3f0e, 0x3f0e, 0x3f0e, 0x3f0c, 0x3f0c, + 0x3f0c, 0x3f0c, 0x3f0a, 0x3f0a, 0x3f0a, 0x3f0a, 0x3f0a, 0x3f08, 0x3f07, 0x3f07, 0x3f07, 0x3f07, + 0x3f07, 0x3f07, 0x3f07, 0x3f05, 0x3f05, 0x3f05, 0x3f05, 0x3f05, 0x3f03, 0x3f03, 0x3f03, 0x3f01, + 0x3f01, 0x3f01, 0x3efe, 0x3efe, 0x3efe, 0x3efe, 0x3efe, 0x3efa, 0x3efa, 0x3efa, 0x3efa, 0x3ef6, + 0x3ef6, 0x3ef6, 0x3ef6, 0x3ef6, 0x3ef1, 0x3ef1, 0x3ef1, 0x3ef1, 0x3eed, 0x3eed, 0x3eed, 0x3eed, + 0x3eed, 0x3ee9, 0x3ee9, 0x3ee9, 0x3ee5, 0x3ee5, 0x3ee5, 0x3ee5, 0x3ee5, 0x3ee5, 0x3ee5, 0x3ee1, + 0x3ee1, 0x3ee1, 0x3edd, 0x3edd, 0x3edd, 0x3edd, 0x3edd, 0x3edd, 0x3edd, 0x3ed9, 0x3ed9, 0x3ed4, + 0x3ed4, 0x3ed4, 0x3ed4, 0x3ed4, 0x3ed4, 0x3ed0, 0x3ed0, 0x3ed0, 0x3ed0, 0x3ed0, 0x3ecc, 0x3ecc, + 0x3ecc, 0x3ecc, 0x3ecc, 0x3ecc, 0x3ecc, 0x3ec7, 0x3ec7, 0x3ec3, 0x3ec3, 0x3ec3, 0x3ec3, 0x3ec3, + 0x3ec3, 0x3ec3, 0x3ec3, 0x3ebe, 0x3ebe, 0x3ebe, 0x3ebe, 0x3ebe, 0x3eba, 0x3eba, 0x3eba, 0x3eba, + 0x3eba, 0x3eb5, 0x3eb5, 0x3eb5, 0x3eb5, 0x3eb1, 0x3eb1, 0x3eb1, 0x3eb1, 0x3eb1, 0x3eb1, 0x3eac, + 0x3eac, 0x3eac, 0x3eac, 0x3eac, 0x3ea8, 0x3ea8, 0x3ea8, 0x3ea8, 0x3ea8, 0x3ea8, 0x3ea8, 0x3ea8, + 0x3ea8, 0x3ea3, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9a, + 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e95, 
0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e91, + 0x3e91, 0x3e91, 0x3e91, 0x3e91, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, + 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e82, 0x3e82, 0x3e82, 0x3e82, 0x3e82, 0x3e82, 0x3e7b, + 0x3e7b, 0x3e7b, 0x3e7b, 0x3e7b, 0x3e7b, 0x3e71, 0x3e71, 0x3e71, 0x3e71, 0x3e71, 0x3e71, 0x3e71, + 0x3e71, 0x3e71, 0x3e67, 0x3e67, 0x3e67, 0x3e67, 0x3e67, 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5e, + 0x3e5e, 0x3e5e, 0x3e5e, 0x3e54, 0x3e54, 0x3e54, 0x3e54, 0x3e54, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, + 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e40, 0x3e40, 0x3e40, 0x3e40, 0x3e40, 0x3e40, 0x3e36, + 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e2c, 0x3e2c, 0x3e2c, 0x3e2c, + 0x3e2c, 0x3e2c, 0x3e22, 0x3e22, 0x3e22, 0x3e22, 0x3e22, 0x3e22, 0x3e22, 0x3e22, 0x3e18, 0x3e18, + 0x3e18, 0x3e18, 0x3e18, 0x3e18, 0x3e18, 0x3e18, 0x3e0e, 0x3e0e, 0x3e0e, 0x3e0e, 0x3e0e, 0x3e0e, + 0x3e0e, 0x3e0e, 0x3e04, 0x3e04, 0x3e04, 0x3e04, 0x3e04, 0x3e04, 0x3e04, 0x3e04, 0x3df5, 0x3df5, + 0x3df5, 0x3df5, 0x3df5, 0x3df5, 0x3df5, 0x3df5, 0x3de0, 0x3de0, 0x3de0, 0x3de0, 0x3de0, 0x3de0, + 0x3de0, 0x3de0, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, 0x3dcc, + 0x3db8, 0x3db8, 0x3db8, 0x3db8, 0x3db8, 0x3db8, 0x3db8, 0x3da3, 0x3da3, 0x3da3, 0x3da3, 0x3da3, + 0x3da3, 0x3da3, 0x3da3, 0x3da3, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, 0x3d8f, + 0x3d8f, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d75, 0x3d4d, 0x3d4d, + 0x3d4d, 0x3d4d, 0x3d4d, 0x3d4d, 0x3d4d, 0x3d4d, 0x3d4d, 0x3d24, 0x3d24, 0x3d24, 0x3d24, 0x3d24, + 0x3d24, 0x3d24, 0x3d24, 0x3d24, 0x3d24, 0x3cf6, 0x3cf6, 0x3cf6, 0x3cf6, 0x3cf6, 0x3cf6, 0x3cf6, + 0x3cf6, 0x3cf6, 0x3cf6, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, 0x3ca4, + 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x0, 0x0, + 0x0, 0x0, 0x0, +}; + +// = tolerant_max) { + exit(-1); + } + } + } + + return true; 
+} + +static void _gen_input(uint16_t *input_data, uint64_t ifmap_size, int range_start, int range_end) { + std::random_device rd; + std::mt19937 e2(rd()); + std::uniform_real_distribution<> dist(range_start, range_end); + + float LO = pow(2, range_start); + float HI = pow(2, range_end); + for (uint64_t i = 0; i < ifmap_size; i++) { + // input range is -8 ~ +8 + int table_hw = 256; + float input = + ((int)i % (range_end - 2)) * (((int)i % 2) ? 1 : -1) + 0.03 + (i % table_hw) * 0.002; + input = ((int)i % (range_end - 2)) * (((int)i % 2) ? 1 : 1) + 0.03 + (i % table_hw) * 0.002; + input_data[i] = convert_fp32_bf16(input); + input = dist(e2); + input = LO + static_cast(rand()) / (static_cast(RAND_MAX / (HI - LO))); + } +} + +static void gen_input(uint16_t *x, uint16_t *y, uint64_t ifmap_size, TEST_MODE mode, + int range_start, int range_end) { + if (mode == PRE_DATA_COMPARE_FIX) { + memcpy(x, &test_pattern, sizeof(test_pattern)); + } else { + range_start = abs(range_start); + range_end = abs(range_end); + _gen_input(x, ifmap_size, range_start, range_end); + } + + // invert for test + for (uint64_t i = 0; i < ifmap_size; i++) { + y[i] = x[(ifmap_size - 1) - i]; + } + + if (mode == DATA_COMPARE_ACCURACY_X_GT_0) { + // y = any + uint32_t i = 0; + for (; i < ifmap_size / 4; i++) { + // y < 0 + y[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(y[i])); + y[i + ifmap_size / 4] = convert_fp32_bf16(0); + } + } else if (mode == DATA_COMPARE_ACCURACY_X_LT_0_Y_GE_0) { + // x < 0 and y >= 0 + for (uint32_t i = 0; i < ifmap_size; i++) { + x[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(x[i])); + } + + for (uint32_t i = 0; i < ifmap_size / 4; i++) { + y[i + ifmap_size / 4] = convert_fp32_bf16(0); + } + } else if (mode == DATA_COMPARE_ACCURACY_X_LT_0_Y_LT_0) { + // x < 0 and y < 0 + for (uint32_t i = 0; i < ifmap_size; i++) { + x[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(x[i])); + y[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(y[i])); + } + } else if (mode == 
DATA_COMPARE_ACCURACY_X_0_Y_GT_0) { + // pi / 2, x = 0 and y > 0 + for (uint32_t i = 0; i < ifmap_size; i++) { + x[i] = convert_fp32_bf16(0); + } + } else if (mode == DATA_COMPARE_ACCURACY_X_0_Y_LT_0) { + // -pi / 2, x = 0 and y < 0 + for (uint32_t i = 0; i < ifmap_size; i++) { + x[i] = convert_fp32_bf16(0); + y[i] = convert_fp32_bf16(-1 * convert_fp32_bf16(y[i])); + } + } + +#if 1 + + if (mode != PRE_DATA_COMPARE_FIX) { + int i = 0; + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(1.394531); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(0.394531); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(0.594531); + x[i] = convert_fp32_bf16(-10.0); + y[i++] = convert_fp32_bf16(6.0); + x[i] = convert_fp32_bf16(1.0); + y[i++] = convert_fp32_bf16(-1.); + x[i] = convert_fp32_bf16(-1.0); + y[i++] = convert_fp32_bf16(1.); + x[i] = convert_fp32_bf16(0.111816); + y[i++] = convert_fp32_bf16(0); + x[i] = convert_fp32_bf16(2.031250); + y[i++] = convert_fp32_bf16(0.0); + x[i] = convert_fp32_bf16(-2.031250); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(-1.394531); + y[i++] = convert_fp32_bf16(0.0); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(-6.0); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(-0.394531); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(-0.594531); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(0.0); + x[i] = convert_fp32_bf16(-8); + y[i++] = convert_fp32_bf16(0); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(3.0); + x[i] = convert_fp32_bf16(-1.0); + y[i++] = convert_fp32_bf16(-5.0); + x[i] = convert_fp32_bf16(-2.484375); + y[i++] = convert_fp32_bf16(-7.531250); + x[i++] = convert_fp32_bf16(-125.000000); + y[i] = convert_fp32_bf16(5.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(19.000000); + y[i] = convert_fp32_bf16(1.070312); + x[i++] = convert_fp32_bf16(0.498046); + y[i] = 
convert_fp32_bf16(0.000000); + x[i] = convert_fp32_bf16(424.000); + y[i++] = convert_fp32_bf16(-1.00); + x[i] = convert_fp32_bf16(2.484375); + y[i++] = convert_fp32_bf16(-7.531250); + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(7.531250); + + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(-7.531250); + + x[i] = convert_fp32_bf16(0); + y[i++] = convert_fp32_bf16(0.394531); + y[i] = convert_fp32_bf16(-4.000000); + x[i++] = convert_fp32_bf16(-64.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-40.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-53.000000); + y[i] = convert_fp32_bf16(-9.000000); + x[i++] = convert_fp32_bf16(-91.000000); + y[i] = convert_fp32_bf16(12.000000); + x[i++] = convert_fp32_bf16(-164.000000); + y[i] = convert_fp32_bf16(-20.000000); + x[i++] = convert_fp32_bf16(-320.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-71.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-155.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-247.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-118.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-54.000000); + y[i] = convert_fp32_bf16(-5.000000); + x[i++] = convert_fp32_bf16(-392.000000); + y[i] = convert_fp32_bf16(-37.000000); + x[i++] = convert_fp32_bf16(-520.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-19.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-10.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = 
convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-21.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-17.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-17.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-10.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-8.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-14.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-2.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-69.000000); + y[i] = convert_fp32_bf16(4.000000); + x[i++] = convert_fp32_bf16(-86.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = convert_fp32_bf16(-34.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(1.000000); + x[i++] = convert_fp32_bf16(-41.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-136.000000); + y[i] = convert_fp32_bf16(-3.000000); + x[i++] = convert_fp32_bf16(-79.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-38.000000); + y[i] = convert_fp32_bf16(5.000000); + x[i++] = convert_fp32_bf16(-173.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-78.000000); + y[i] = convert_fp32_bf16(-2.000000); + x[i++] = 
convert_fp32_bf16(-60.000000); + y[i] = convert_fp32_bf16(3.000000); + x[i++] = convert_fp32_bf16(-123.000000); + y[i] = convert_fp32_bf16(-9.000000); + x[i++] = convert_fp32_bf16(-280.000000); + y[i] = convert_fp32_bf16(3.000000); + x[i++] = convert_fp32_bf16(-39.000000); + y[i] = convert_fp32_bf16(2.000000); + x[i++] = convert_fp32_bf16(-524.000000); + y[i] = convert_fp32_bf16(11.000000); + x[i++] = convert_fp32_bf16(-376.000000); + y[i] = convert_fp32_bf16(5.000000); + x[i++] = convert_fp32_bf16(-131.000000); + y[i] = convert_fp32_bf16(11.000000); + x[i++] = convert_fp32_bf16(-324.000000); + y[i] = convert_fp32_bf16(9.000000); + x[i++] = convert_fp32_bf16(-125.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-92.000000); + y[i] = convert_fp32_bf16(-7.000000); + x[i++] = convert_fp32_bf16(-233.000000); + y[i] = convert_fp32_bf16(10.000000); + x[i++] = convert_fp32_bf16(-170.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-4.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-10.000000); + y[i] = convert_fp32_bf16(-1.000000); + x[i++] = convert_fp32_bf16(-23.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(0.000000); + x[i++] = convert_fp32_bf16(-6.000000); + y[i] = convert_fp32_bf16(-3.000000); + x[i++] = convert_fp32_bf16(-37.000000); + + y[i] = convert_fp32_bf16(-9); + x[i++] = convert_fp32_bf16(-1); + + y[i] = convert_fp32_bf16(7.0); + x[i++] = convert_fp32_bf16(-1); + + y[i] = convert_fp32_bf16(0); + x[i++] = convert_fp32_bf16(-1); + } +#else + for (uint64_t i = 0; i < ifmap_size; i++) { + x[i] = convert_fp32_bf16(5.375000); + y[i] = convert_fp32_bf16(2.203125); + } +#endif + +#ifdef DBG + for (uint64_t i = 0; i < ifmap_size; i++) { + printf("source[%ld] y %f x %f\n", i, convert_bf16_fp32(y[i]), convert_bf16_fp32(x[i])); + } +#endif /* 
ifdef DBG */ +} + +static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) { + // TODO: check more shape / align + cvk_chip_info_t chip_info = bmk->info; + + uint32_t input_n = 1; + uint32_t input_c = chip_info.npu_num; + uint32_t input_h = 16; + uint32_t input_w = 16; + float epsilon = 0.1; + int range_start = -8; + int range_end = 8; + + if (mode == PRE_DATA_COMPARE_FIX) { + input_h = 4; + input_w = 8; + } + + cvk_fmt_t fmt = CVK_FMT_BF16; + + cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + cvk_tl_shape_t ofmap_shape = ifmap_shape; + + // get lut table shape and size + cvk_tl_shape_t table_shape; + uint64_t table_bytesize = cvm_lut_tbl_bytesize(bmk, &table_shape, fmt); + + // get input / output size + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + int data_type_size = bytesize_of_fmt(fmt); + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + if (mode == PRE_DATA_COMPARE_FIX) { + ofmap_bytesize = sizeof(golden_bf16) / sizeof(golden_bf16[0]) * data_type_size; + } + + // atan2 was two inputs + cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ifmap2 = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *out = tl_ofmap_bf16; + + // atan buf + cvk_tl_t *tl_y0_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_invert_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_pos_neg_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + + // reciprocal buf + cvk_tl_t *tl_reciprocal_table_answer = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_reciprocal_table_answer_mantissa = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + + // temp buf + cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf2 = 
test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf3 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1); + + uint16_t *input_data = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *input_data2 = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize); + + // for reciprocal + uint16_t *table_reciprocal_data = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_reciprocal_data_mantissa = (uint16_t *)xmalloc(table_bytesize); + + // for atan + uint16_t *table_data_atan_y0 = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_atan_invert = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_atan_pos_neg = (uint16_t *)xmalloc(table_bytesize); + + // for search '0' index + uint16_t *idx_0_table_data = (uint16_t *)xmalloc(table_bytesize); + + // init input / ref + // input_data is x, input_data2 is y + gen_input(input_data, input_data2, ifmap_size, mode, range_start, range_end); + tl_lut_ref(ref_data, input_data, input_data2, ifmap_shape); + + // init lut table + cvm_reciprocal_tbl(table_reciprocal_data, table_reciprocal_data_mantissa, &table_shape); + cvm_atan_tbl(table_data_atan_y0, NULL, table_data_atan_invert, table_data_atan_pos_neg, + &table_shape); + cvm_gen_0_tbl(idx_0_table_data, &table_shape); + + // sys->local + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)input_data); + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap2, (uint8_t *)input_data2); + test_put_tensor_g2l_comp(ctx, bmk, tl_reciprocal_table_answer, (uint8_t *)table_reciprocal_data); + test_put_tensor_g2l_comp(ctx, bmk, tl_reciprocal_table_answer_mantissa, + (uint8_t *)table_reciprocal_data_mantissa); + + test_put_tensor_g2l_comp(ctx, bmk, tl_y0_buf, (uint8_t *)table_data_atan_y0); + test_put_tensor_g2l_comp(ctx, bmk, tl_invert_buf, (uint8_t *)table_data_atan_invert); + test_put_tensor_g2l_comp(ctx, bmk, tl_pos_neg_buf, (uint8_t *)table_data_atan_pos_neg); + + cvm_atan2_merge_emit(bmk, tl_ifmap2, 
tl_ifmap, tl_buf, tl_buf2, tl_buf3, tl_y0_buf, tl_invert_buf, + tl_pos_neg_buf, tl_reciprocal_table_answer, + tl_reciprocal_table_answer_mantissa, OUT tl_ofmap_bf16, fmt); + + uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, out); + verify(ofmap_data, ref_data, input_data, input_data2, ifmap_size, epsilon); + + free_tl(bmk, tl_buf3); + free_tl(bmk, tl_buf2); + free_tl(bmk, tl_buf); + free_tl(bmk, tl_reciprocal_table_answer_mantissa); + free_tl(bmk, tl_reciprocal_table_answer); + free_tl(bmk, tl_pos_neg_buf); + free_tl(bmk, tl_invert_buf); + free_tl(bmk, tl_y0_buf); + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_ifmap2); + free_tl(bmk, tl_ifmap); + + free(idx_0_table_data); + free(table_data_atan_y0); + free(table_data_atan_invert); + free(table_data_atan_pos_neg); + free(table_reciprocal_data); + free(table_reciprocal_data_mantissa); + free(input_data); + free(ref_data); + free(ofmap_data); + free(input_data2); +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bmk; + int round_mode; + + round_mode = set_store_feround(); + + test_init(&ctx, &bmk); + + // for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++) + // for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_ACCURACY; i++) { + // for (int i = DATA_COMPARE_ACCURACY; i < DATA_COMPARE_U8; i++) { + // for (int i = DATA_COMPARE_ACCURACY; i < DATA_COMPARE_ACCURACY_X_GT_0; i++) + // { + for (int i = PRE_DATA_COMPARE_FIX; i < DATA_COMPARE_U8; i++) { + mode = static_cast(i); + printf("test mode %d...\n", mode); + testbench(&ctx, bmk); + } + printf("pass\n"); + + test_exit(&ctx, bmk); + restore_feround(round_mode); + return 0; +} diff --git a/cvimath/tests/cvi1835/bf16_fp32.cpp b/cvimath/tests/cvi1835/bf16_fp32.cpp new file mode 100644 index 000000000..781dbc3fd --- /dev/null +++ b/cvimath/tests/cvi1835/bf16_fp32.cpp @@ -0,0 +1,148 @@ +// \file mask sample for gt(great than), ge(great equal), eq(equal), lt(less than), le(less equal) + +// header include +#include +#include // math 
+#include // kerenl + +void init_input(uint16_t *input_data, uint64_t ifmap_size) { + for (uint64_t i = 0; i < ifmap_size; i++) { + input_data[i] = convert_fp32_bf16(i * 1.0); + } +} + +void init_ref(uint16_t *input_data, uint32_t *ref_data, uint64_t ifmap_size) { + union s { + uint16_t int16[2]; // big endian + uint32_t int32; + }; + union s _s; + for (uint64_t i = 0; i < ifmap_size; i++) { + _s.int16[0] = 0; + _s.int16[1] = input_data[i]; + ref_data[i] = _s.int32; + } +} + +static void testbench(CVI_RT_HANDLE *rt_ctx, cvk_context_t *cvk_ctx, + cvk_tg_shape_t *bf16_tg_shape) { + // for calculate size we need in host + cvk_tl_shape_t ifmap_shape = {bf16_tg_shape->n, bf16_tg_shape->c, bf16_tg_shape->h, + bf16_tg_shape->w}; + + // * 2 means fp32 takes twice size of bf16 + cvk_tl_shape_t ofmap_shape = {bf16_tg_shape->n, bf16_tg_shape->c, bf16_tg_shape->h, + bf16_tg_shape->w * 2}; + + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + + // unit size is 1 bytes, bf16 takes 2 bytes + int data_type_size = 2; + + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + + // * 2 means fp32 takes twice size of bf16 + uint64_t ofmap_bytesize = ofmap_size * data_type_size * 2; + + uint8_t *input_data = (uint8_t *)xmalloc(ifmap_bytesize); + uint8_t *ref_data = (uint8_t *)xmalloc(ofmap_bytesize); + + // init input / output data in ddr + init_input((uint16_t *)input_data, ifmap_size); + init_ref((uint16_t *)input_data, (uint32_t *)ref_data, ifmap_size); + + // send host memory->device memory + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_tg_shape_t fp32_tg_shape; + fp32_tg_shape = {ofmap_shape.n, ofmap_shape.c, ofmap_shape.h, ofmap_shape.w}; + + cvk_tg_t *bf16_tg = test_alloc_tg_mem_comp(rt_ctx, cvk_ctx, *bf16_tg_shape, fmt); + assert(bf16_tg && "alloc bf16 fail"); + + test_put_tg_mem_comp(rt_ctx, bf16_tg, (uint8_t *)input_data); + + cvk_tg_t *fp32_tg = test_alloc_tg_mem_comp(rt_ctx, cvk_ctx, fp32_tg_shape, fmt); + assert(bf16_tg && 
"alloc fp32 fail"); + + // prepare command buffer + cvm_bf16_fp32(cvk_ctx, bf16_tg, fp32_tg); + + // submit descriptor + test_submit_comp(rt_ctx, cvk_ctx); + + // get data from tl + uint8_t *ofmap_data = test_get_tg_mem_comp(rt_ctx, fp32_tg); + + // compare with reference with byte + for (uint32_t i = 0; i < ofmap_size; i++) { + if (ref_data[i] != ofmap_data[i]) { + fprintf(stderr, "comparing failed output[%u] got %u, ref %u\n", i, ofmap_data[i], + ref_data[i]); + // fail case + exit(-1); + } + } + + // free resource from tpu memory + test_free_tg_mem_comp(rt_ctx, bf16_tg); + test_free_tg_mem_comp(rt_ctx, fp32_tg); + + // free resource from host memory + free(input_data); + free(ref_data); + free(ofmap_data); +} + +int main() { + CVI_RT_HANDLE rt_ctx; + cvk_context_t *cvk_ctx; + int round_mode; + + // align kerenl rounding mode + round_mode = set_store_feround(); + + // init runtime / kerenl structure + test_init(&rt_ctx, &cvk_ctx); + + cvk_tg_shape_t bf16_tg_shape = {1, 2, 3, 4}; + { + // test 1 + printf("test bf16 <%d,%d,%d,%d> to fp32\n", bf16_tg_shape.n, bf16_tg_shape.c, bf16_tg_shape.h, + bf16_tg_shape.w); + testbench(&rt_ctx, cvk_ctx, &bf16_tg_shape); + printf("compare test bf16 to fp32 done\n"); + } + + { + // test 2 + bf16_tg_shape = {1, 20, 30, 40}; + printf("test bf16 <%d,%d,%d,%d> to fp32\n", bf16_tg_shape.n, bf16_tg_shape.c, bf16_tg_shape.h, + bf16_tg_shape.w); + testbench(&rt_ctx, cvk_ctx, &bf16_tg_shape); + printf("compare test bf16 to fp32 done\n"); + } + + bf16_tg_shape = {40, 40, 128, 256}; + for (int n = 1; n < (int)bf16_tg_shape.n; n += 10) { + for (int c = 1; c < (int)bf16_tg_shape.c; c += 10) { + for (int h = 1; h < (int)bf16_tg_shape.h; h += 100) { + for (int w = 2; w < (int)bf16_tg_shape.w; w += 100) { + printf("test bf16 <%d,%d,%d,%d> to fp32\n", bf16_tg_shape.n, bf16_tg_shape.c, + bf16_tg_shape.h, bf16_tg_shape.w); + cvk_tg_shape_t _bf16_tg_shape = {(uint32_t)n, (uint32_t)c, (uint32_t)h, (uint32_t)w}; + testbench(&rt_ctx, cvk_ctx, 
&_bf16_tg_shape); + printf("compare test bf16 to fp32 done\n"); + } + } + } + } + + // de-init runtime / kerenl structure + test_exit(&rt_ctx, cvk_ctx); + + // restore rounding mode + restore_feround(round_mode); + + return 0; +} diff --git a/cvimath/tests/cvi1835/blas_cpu.cpp b/cvimath/tests/cvi1835/blas_cpu.cpp new file mode 100644 index 000000000..d38ce9ca6 --- /dev/null +++ b/cvimath/tests/cvi1835/blas_cpu.cpp @@ -0,0 +1,60 @@ +#include + +#include +#include +#include +#include +#include + +int main() { + srand(time(NULL)); + const uint32_t data_length = 512; + const uint32_t data_num = 20000; + uint8_t *db = new uint8_t[data_num * data_length]; + float *db_unit = new float[data_num]; + uint8_t *data = new uint8_t[data_length]; + float *buffer_f = new float[data_num]; + memset(buffer_f, 0, data_num * sizeof(float)); + + for (uint32_t i = 0; i < data_length; i++) { + data[i] = rand() % 256; + } + for (uint32_t j = 0; j < data_num; j++) { + for (uint32_t i = 0; i < data_length; i++) { + db[j * data_length + i] = rand() % 256; + } + } + cvm_gen_db_unit_length(db, db_unit, data_length, data_num); + + const uint32_t k = 5; + uint32_t k_index[k] = {0}; + float k_value[k] = {0}; + struct timeval t0, t1; + gettimeofday(&t0, NULL); + cvm_cpu_u8data_ip_match(data, db, db_unit, k_index, k_value, buffer_f, data_length, data_num, k); + gettimeofday(&t1, NULL); + unsigned long elapsed_tpu = ((t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec); + printf("Searching time uint8: %lu us\n", elapsed_tpu); + printf("Result:\n"); + for (uint32_t i = 0; i < k; i++) { + printf("[%u] %f\n", k_index[i], k_value[i]); + } + printf("\n"); + gettimeofday(&t0, NULL); + cvm_cpu_i8data_ip_match((int8_t *)data, (int8_t *)db, db_unit, k_index, k_value, buffer_f, + data_length, data_num, k); + gettimeofday(&t1, NULL); + elapsed_tpu = ((t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec); + printf("Searching time int8: %lu us\n", elapsed_tpu); + printf("Result:\n"); + for 
(uint32_t i = 0; i < k; i++) { + printf("[%u] %f\n", k_index[i], k_value[i]); + } + printf("\n"); + + delete[] data; + delete[] db; + delete[] db_unit; + delete[] buffer_f; + return 0; +} \ No newline at end of file diff --git a/cvimath/tests/cvi1835/blas_tpu.cpp b/cvimath/tests/cvi1835/blas_tpu.cpp new file mode 100644 index 000000000..82be4a7c2 --- /dev/null +++ b/cvimath/tests/cvi1835/blas_tpu.cpp @@ -0,0 +1,134 @@ +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +void i8data_ip_match(CVI_RT_HANDLE ctx, cvk_context_t *cvk_ctx, uint64_t a_gaddr, int8_t *a_vaddr, + uint64_t db_gaddr, float *unit_db_arr, uint32_t *k_index, float *k_value, + uint64_t buffer_gemm_gaddr, uint8_t *buffer_gemm_vaddr, uint32_t *buffer_i32, + float *buffer_f, CVI_RT_MEM gemm_device, const uint32_t data_length, + const uint32_t data_num, const uint32_t k) { + size_t *slice_num = + cvm_gemm(cvk_ctx, a_gaddr, db_gaddr, buffer_gemm_gaddr, 1, data_length, data_num, CVK_FMT_I8); + CVI_RT_Submit(cvk_ctx); + CVI_RT_MemInvld(ctx, gemm_device); + cvm_combin_gemm_i8(slice_num, buffer_gemm_vaddr, buffer_i32, 1, data_num); + free(slice_num); + // Get a length + int32_t dot_result = 0; + for (uint32_t i = 0; i < data_length; i++) { + dot_result += ((short)a_vaddr[i] * a_vaddr[i]); + } + float unit_a = sqrt(dot_result); + // Get a length end + + for (uint32_t i = 0; i < data_num; i++) { + buffer_f[i] = ((int32_t *)buffer_i32)[i] / (unit_a * unit_db_arr[i]); + } + // Get k result + for (uint32_t i = 0; i < k; i++) { + int largest = 0; + for (uint32_t j = 0; j < data_num; j++) { + if (buffer_f[j] > buffer_f[largest]) { + largest = j; + } + } + k_value[i] = buffer_f[largest]; + k_index[i] = largest; + buffer_f[largest] = 0; + } +} + +int main() { + CVI_RT_HANDLE ctx; + CVI_RT_Init(&ctx); + cvk_context_t *bk_ctx = (cvk_context_t *)CVI_RT_RegisterKernel(ctx, 100000); + printf("123\n"); + + const uint32_t data_length = 512; + const uint32_t data_num = 1000; + // 
Allocate memory + CVI_RT_MEM bmmem_a = CVI_RT_MemAlloc(ctx, data_length); + CVI_RT_MEM bmmem_db = CVI_RT_MemAlloc(ctx, data_length * data_num); + CVI_RT_MEM bmmem_c = CVI_RT_MemAlloc(ctx, data_num * sizeof(uint32_t)); + + uint64_t gaddr_a = CVI_RT_MemGetPAddr(bmmem_a); + uint64_t gaddr_db = CVI_RT_MemGetPAddr(bmmem_db); + uint64_t gaddr_c = CVI_RT_MemGetPAddr(bmmem_c); + + uint8_t *vaddr_a = CVI_RT_MemGetVAddr(bmmem_a); + uint8_t *vaddr_db = CVI_RT_MemGetVAddr(bmmem_db); + uint8_t *vaddr_c = CVI_RT_MemGetVAddr(bmmem_c); + + int8_t *db_raw = new int8_t[data_length * data_num]; + float *db_unit = new float[data_num]; + uint32_t *buffer = new uint32_t[data_num]; + float *buffer_f = new float[data_num]; + + // Generate data + srand(time(NULL)); + for (uint32_t i = 0; i < data_length; i++) { + ((int8_t *)vaddr_a)[i] = rand() % 10 - 10; + } + for (uint32_t j = 0; j < data_num; j++) { + for (uint32_t i = 0; i < data_length; i++) { + ((int8_t *)db_raw)[j * data_length + i] = rand() % 10 - 10; + } + } + + // Pass db feature to ion + for (uint32_t n = 0; n < data_num * data_length; n++) { + int i = n / data_num; + int j = n % data_num; + ((int8_t *)vaddr_db)[n] = db_raw[data_length * j + i]; + } + + // Calculate unit length for db feature + cvm_gen_precached_i8_unit_length((int8_t *)db_raw, db_unit, data_length, data_num); + CVI_RT_MemFlush(ctx, bmmem_a); + CVI_RT_MemFlush(ctx, bmmem_db); + + const uint32_t k = 5; + uint32_t k_index[k] = {0}; + float k_value[k] = {0}; + struct timeval t0, t1; + gettimeofday(&t0, NULL); + i8data_ip_match(ctx, bk_ctx, gaddr_a, (int8_t *)vaddr_a, gaddr_db, db_unit, k_index, k_value, + gaddr_c, vaddr_c, buffer, buffer_f, bmmem_c, data_length, data_num, k); + gettimeofday(&t1, NULL); + unsigned long elapsed_tpu = ((t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec); + printf("Searching time tpu int8: %lu us\n", elapsed_tpu); + printf("Result:\n"); + for (uint32_t i = 0; i < k; i++) { + printf("[%u] %f\n", k_index[i], k_value[i]); + } + 
printf("\n"); + + gettimeofday(&t0, NULL); + cvm_cpu_i8data_ip_match((int8_t *)vaddr_a, (int8_t *)db_raw, db_unit, k_index, k_value, buffer_f, + data_length, data_num, k); + gettimeofday(&t1, NULL); + elapsed_tpu = ((t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec); + printf("Searching time int8: %lu us\n", elapsed_tpu); + printf("Result:\n"); + for (uint32_t i = 0; i < k; i++) { + printf("[%u] %f\n", k_index[i], k_value[i]); + } + printf("\n"); + + delete[] db_unit; + delete[] buffer; + delete[] buffer_f; + CVI_RT_MemFree(ctx, bmmem_a); + CVI_RT_MemFree(ctx, bmmem_db); + CVI_RT_MemFree(ctx, bmmem_c); + CVI_RT_UnRegisterKernel(bk_ctx); + CVI_RT_DeInit(ctx); + return 0; +} diff --git a/cvimath/tests/cvi1835/depthwise_reshape_same.cpp b/cvimath/tests/cvi1835/depthwise_reshape_same.cpp new file mode 100644 index 000000000..b591e5c30 --- /dev/null +++ b/cvimath/tests/cvi1835/depthwise_reshape_same.cpp @@ -0,0 +1,907 @@ +#include +#include + +#include // calc_dilute_hw + +#define NPU_NUM (1 << 5) +typedef cvk_tiu_depthwise_pt_convolution_param_t param_t; + +int random_seed; +static void print_pooling_param(param_t *p) { + int in = p->ifmap->shape.n; + int ic = p->ifmap->shape.c; + int ih = p->ifmap->shape.h; + int iw = p->ifmap->shape.w; + int kh = p->weight->shape.h; + int kw = p->weight->shape.w; + + printf(" Pooling parameters:\n"); + // printf(" random_seed : %d \n", random_seed); + printf(" ifmap = (%d, %d, %d, %d)\n", in, ic, ih, iw); + printf(" opd0_sign = %d\n", p->ifmap->fmt == CVK_FMT_I8); + printf(" weight = (%d, %d)\n", kh, kw); + printf(" padding = (%d, %d, %d, %d)\n", p->pad_top, p->pad_bottom, p->pad_left, p->pad_right); + printf(" stride = (%d, %d)\n", p->stride_h, p->stride_w); + // printf(" ins0 = (%d, %d, %d, %d)\n", + // p->ins_h, p->ins_last_h, p->ins_w, p->ins_last_w); + // printf(" dilation = (%d, %d)\n",p->dilation_h, p->dilation_w); + // printf(" rshift_bits = %d\n", p->rshift_bits); + // printf(" relu_enable = %d\n", p->relu_enable); 
+ printf(" res0_sign = %d\n", p->ofmap->fmt == CVK_FMT_I8); +} + +static uint16_t *alloc_input(int ic, int ih, int iw, cvk_fmt_t ifmt) { + uint64_t size = ic * ih * iw; + uint16_t *data = (uint16_t *)new uint16_t[(size)]; + if (ifmt == CVK_FMT_BF16) { + for (uint64_t i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX / 2; // 5 ~ -5 + val = (float)(rand() - RAND_MAX2) * 5 / (float)RAND_MAX; + val = i; + data[i] = convert_fp32_bf16(val); + } + } else { + uint8_t *d = (uint8_t *)data; + for (uint64_t i = 0; i < size; i++) { + d[i] = i % 10 * (i % 2 ? -1 : 1); + } + } + + return data; +} + +static uint16_t *alloc_weight(int ic, int kh, int kw, cvk_fmt_t fmt) { + int size = ic * kh * kw; + uint16_t *data = (uint16_t *)malloc(size * sizeof(uint16_t)); + // printf("weight size is %d\n", size * 2); + if (fmt == CVK_FMT_BF16) { + for (int i = 0; i < size; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX / 2; // 5 ~ -5 + val = (float)(rand() - RAND_MAX2) * 5 / (float)RAND_MAX; + val = i; + data[i] = convert_fp32_bf16(val); + } + } else { + uint8_t *d = (uint8_t *)data; + for (int i = 0; i < size; i++) { + d[i] = i % 5 * (i % 2 ? -1 : 1); + } + } + return data; +} + +static uint32_t *alloc_bias(int ic, cvk_fmt_t fmt) { + int c = ic; + uint64_t size = c; + uint32_t *bias = (uint32_t *)malloc(sizeof(uint32_t) * c); + if (fmt == CVK_FMT_BF16) { + for (int i = 0; i < c; i++) { + float val = 0; + int RAND_MAX2 = RAND_MAX / 2; // 2 ~ -2 + val = (float)(rand() - RAND_MAX2) * 2 / (float)RAND_MAX; + val = i; + bias[i] = convert_fp32_hex(val); + } + } else { + uint16_t *d = (uint16_t *)bias; + for (uint64_t i = 0; i < size; i++) { + d[i] = i % 0xf * (i % 2 ? 
-1 : 1); + } + } + return bias; +} + +static uint16_t *alloc_output(int ic, int oh, int ow) { + uint64_t size = ic * oh * ow; + return (uint16_t *)new uint16_t[(size)]; +} + +static inline void cvm_relu(uint16_t *buf, uint64_t size, cvk_fmt_t fmt) { + if (fmt == CVK_FMT_BF16) { + for (uint64_t i = 0; i < size; i++) + if (convert_bf16_fp32(buf[i]) < 0) buf[i] = convert_fp32_bf16(0); + } else { + int8_t *buf_int8_t = (int8_t *)buf; + for (uint64_t i = 0; i < size; i++) { + if (buf_int8_t[i] < 0) buf_int8_t[i] = 0; + } + } +} + +static int index_get(int h, int w1, int w2) { return h * w1 + w2; } + +int native_pooling_avg_bf16(const uint16_t *i_fmap, const void *weight, const uint32_t *bias, + uint16_t *o_fmap, int input_n, int input_c, int input_h, int input_w, + int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, int ins_h, int ins_w, int ins_h_last, + int ins_w_last, int dh, int dw, int const_weight) { + if (kh * kw <= 0) return BM_ERR_INVALID_ARGUMENT; + + uint16_t avg_const_weight = *(uint16_t *)weight; + uint16_t *weight_arr = (uint16_t *)weight; + int h_after = calc_dilute_hw(input_h, ins_h, ins_h_last, pad_h_t, pad_h_b); + int w_after = calc_dilute_hw(input_w, ins_w, ins_w_last, pad_w_l, pad_w_r); + int d_kh = calc_dilute_hw(kh, dh - 1, 0, 0, 0); + int d_kw = calc_dilute_hw(kw, dw - 1, 0, 0, 0); + + int output_h = calc_output_hw(h_after, d_kh, stride_h); + int output_w = calc_output_hw(w_after, d_kw, stride_w); + // printf("output_h/output_w is %d/%d\n", output_h, output_w); + float *avg_pooling_mac_a = (float *)malloc(d_kh * d_kw * sizeof(float)); + float *avg_pooling_mac_b = (float *)malloc(d_kh * d_kw * sizeof(float)); + + uint16_t *i_fmap_pad = NULL; + uint16_t *i_kmap_pad = NULL; + for (int n = 0; n < input_n; n++) { + if (const_weight == 0) weight_arr = (uint16_t *)weight; + + for (int c = 0; c < input_c; ++c) { + fill_pad_fmap_bf16(i_fmap, &i_fmap_pad, 0, pad_w_l, pad_w_r, pad_h_t, pad_h_b, ins_h, ins_w, + 
ins_h_last, ins_w_last, input_h, input_w); + + // kernel_dilation( + if (const_weight == 0) + fill_pad_fmap_bf16((weight_arr), &i_kmap_pad, 0, 0, 0, 0, + 0, // no padding + dh - 1, dw - 1, 0, 0, kh, kw); + + float avg_pool_result; + for (int ph = 0; ph < output_h; ++ph) { + for (int pw = 0; pw < output_w; ++pw) { + int hstart = ph * stride_h; + int wstart = pw * stride_w; + int pool_index = index_get(ph, output_w, pw); + int mac_index = 0; + + float r = 0; + for (int h = 0; h < d_kh; h++) { + for (int w = 0; w < d_kw; w++) { + int index = index_get((hstart + h), w_after, (w + wstart)); + mac_index = h * d_kw + w; + + avg_pooling_mac_a[mac_index] = convert_bf16_fp32(i_fmap_pad[index]); + + avg_pooling_mac_b[h * d_kw + w] = const_weight + ? convert_bf16_fp32(avg_const_weight) + : convert_bf16_fp32(i_kmap_pad[mac_index]); + +#if 0 + printf ("ref[ni %u][ci %u][oh/ow %u/%u][kh/kw %u/%u] o[%d]" + " %.1f * %.1f + %.1f = %.1f\n", + n, c, ph, pw, h, w, pool_index, + avg_pooling_mac_a[mac_index], avg_pooling_mac_b[h*d_kw+w], + r, r + avg_pooling_mac_a[mac_index] * avg_pooling_mac_b[h*d_kw+w]); +#endif + + r += avg_pooling_mac_a[mac_index] * avg_pooling_mac_b[h * d_kw + w]; + } + } + + inner_float_product(avg_pooling_mac_a, avg_pooling_mac_b, d_kh * d_kw, &avg_pool_result); + + if (bias) { + avg_pool_result += convert_hex_fp32(bias[c]); + } + *(o_fmap + pool_index) = convert_fp32_bf16(avg_pool_result); + } + } + weight_arr += kh * kw; + i_fmap += input_w * input_h; + o_fmap += output_w * output_h; + } + } + free(i_fmap_pad); + free(i_kmap_pad); + free(avg_pooling_mac_a); + free(avg_pooling_mac_b); + + return BM_SUCCESS; +} + +static int get_fsz(cvk_fmt_t fmt) { + assert(fmt == CVK_FMT_BF16 || fmt == CVK_FMT_I8 || fmt == CVK_FMT_U8); + return fmt == CVK_FMT_BF16 ? 
2 : 1; +} + +static void compare_results(param_t *p, uint16_t input[], uint16_t weight[], uint32_t bias[], + uint16_t output[], uint16_t output_ref[], uint32_t org_o_shape_size, + int is_valid_pack, int org_oc, int org_oh, int org_ow) { + assert(input); + assert(weight); + (void)input; + (void)weight; + printf("bias at %p\n", bias); + int f_sz = get_fsz(p->ofmap->fmt); + + if (p->relu_enable) { + cvm_relu(output_ref, org_o_shape_size, p->ofmap->fmt); + } + + int cmp_res = -1; + if (!is_valid_pack) { + // we reshape c with SAME mode padding with garbage + // \is_valid_pack set to false means we skip garbage part + int org_hw = org_oh * org_ow; + int new_hw = p->ofmap->shape.h * p->ofmap->shape.w; + int duplicated_c = p->ofmap->shape.c / org_oc; + + assert(new_hw >= org_hw / duplicated_c); + + int8_t *output_c = ((int8_t *)output); + int8_t *output_ref_c = ((int8_t *)output_ref); + for (int c = 0; c < org_oc; c++) { + cmp_res = + array_cmp_int8("Comparing results ...\n", output_c + c * duplicated_c * new_hw * f_sz, + output_ref_c + org_hw * c * f_sz, org_hw * f_sz); + + if (cmp_res != 0) { + break; + } + // printf("compare [%d] pass, org len is %u, new len is %u\n", c, + // org_hw, duplicated_c * new_hw); + } + } else { + cmp_res = array_cmp_int8("Comparing results ...\n", (int8_t *)output_ref, (int8_t *)output, + org_o_shape_size * f_sz); + } + if (cmp_res != 0) { + printf("Comparison FAILED!!!\n"); + // print_pooling_param(p); + exit(-1); + } + + delete[] output_ref; +} + +static int pooling_ih_ext(int ins_h, int ins_last_h, int pad_top, int pad_bottom, int ih) { + int ins = ins_h; + int ins_last = ins_last_h; + int pad = pad_top + pad_bottom; + return (ih - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_iw_ext(int ins_w, int ins_last_w, int pad_left, int pad_right, int iw) { + int ins = ins_w; + int ins_last = ins_last_w; + int pad = pad_left + pad_right; + return (iw - 1) * (ins + 1) + ins_last + 1 + pad; +} + +static int pooling_oh(int ins_h, int 
ins_last_h, int pad_top, int pad_bottom, int stride_h, int ih, + int kh, int dh) { + int ih_ext = pooling_ih_ext(ins_h, ins_last_h, pad_top, pad_bottom, ih); + int d_h = (kh - 1) * dh + 1; + return (ih_ext - d_h) / stride_h + 1; +} + +static int pooling_ow(int ins_w, int ins_last_w, int pad_left, int pad_right, int stride_w, int iw, + int kw, int dw) { + int iw_ext = pooling_iw_ext(ins_w, ins_last_w, pad_left, pad_right, iw); + int d_w = (kw - 1) * dw + 1; + return (iw_ext - d_w) / stride_w + 1; +} + +static void free_depthwise_struct(param_t *p) { + free((void *)p->ofmap); + free((void *)p->ifmap); + free((void *)p->weight); + if (p->bias) { + free((void *)p->bias); + } + + p->ofmap = NULL; + p->ifmap = NULL; + p->weight = NULL; + p->bias = NULL; +} + +static void free_depthwise_param(cvk_context_t *ctx, param_t *p) { + if (p->ofmap) free_tl(ctx, p->ofmap); + + if (p->weight) free_tl(ctx, p->weight); + + if (p->bias) free_tl(ctx, p->bias); + + if (p->ifmap) free_tl(ctx, p->ifmap); +} + +static param_t random_depthwise_param(cvk_context_t *ctx, int _ih, int _iw, int _stride_h, + cvk_fmt_t _fmt) { + param_t p; + + // retry: + random_seed = clock(); + srand(random_seed); + int using_bias = rand() % 2; + int n = rand() % 5 + 1; + n = 1; + int c = rand() % (3 * NPU_NUM) + 1; + c = 3; + int ih = rand() % 30 + 3; + int iw = rand() % 30 + 6; + int kh = rand() % 7 + 1; + int kw = rand() % 7 + 1; + + p.ins_h = rand() % kh; + p.ins_w = rand() % kw; + p.ins_last_h = rand() % kh; + p.ins_last_w = rand() % kw; + p.stride_h = rand() % kh + 1; + p.stride_w = rand() % kw + 1; + p.pad_top = rand() % kh; + p.pad_bottom = rand() % kh; + p.pad_left = rand() % kw; + p.pad_right = rand() % kw; + p.rshift_bits = rand() % 32; + p.dilation_h = rand() % 4 + 1; + p.dilation_w = rand() % 4 + 1; + + // default + cvk_fmt_t ifmt = CVK_FMT_BF16; + cvk_fmt_t other_fmt = CVK_FMT_BF16; + ih = 24; + iw = 16; + kw = 5; + kh = 5; + p.stride_h = 1; + p.stride_w = 1; + + p.rshift_bits = 0; + + ih = _ih; 
+ p.stride_h = _stride_h; + iw = _iw; + ifmt = _fmt; + other_fmt = CVK_FMT_I8; + if (ifmt != CVK_FMT_BF16) { + } else { + other_fmt = CVK_FMT_BF16; + } + + p.pad_left = 2; + p.pad_right = 2; + p.pad_top = 0; + p.pad_bottom = 0; + // TODO: pad / ins / dilation + p.ins_h = 0; + p.ins_last_h = 0; + p.ins_w = 0; + p.ins_last_w = 0; + p.dilation_h = 1; + p.dilation_w = 1; + + int oh = + pooling_oh(p.ins_h, p.ins_last_h, p.pad_top, p.pad_bottom, p.stride_h, ih, kh, p.dilation_h); + int ow = + pooling_ow(p.ins_w, p.ins_last_w, p.pad_left, p.pad_right, p.stride_w, iw, kw, p.dilation_w); + + cvk_tl_shape_t ofmap_shape; + ofmap_shape.n = n; + ofmap_shape.c = c; + ofmap_shape.h = oh; + ofmap_shape.w = ow; + cvk_tl_shape_t ifmap_shape; + ifmap_shape.n = n; + ifmap_shape.c = c; + ifmap_shape.h = ih; + ifmap_shape.w = iw; + cvk_tl_shape_t weight_shape; + weight_shape.n = 1; + weight_shape.c = c; + weight_shape.h = kh; + weight_shape.w = kw; + cvk_tl_shape_t bias_shape; + bias_shape.n = 2; + bias_shape.c = c; + bias_shape.h = 1; + bias_shape.w = 1; + p.relu_enable = rand() % 2; + + // fake init for ref + cvk_tl_t *bias, *weight, *ofmap, *ifmap; + ifmap = (cvk_tl_t *)malloc(sizeof(cvk_tl_t)); + if (using_bias) { + bias = (cvk_tl_t *)malloc(sizeof(cvk_tl_t)); + } + weight = (cvk_tl_t *)malloc(sizeof(cvk_tl_t)); + ofmap = (cvk_tl_t *)malloc(sizeof(cvk_tl_t)); + + p.bias = NULL; + if (using_bias) { + bias->start_address = -1; + bias->fmt = other_fmt; + bias->shape = bias_shape; + bias->stride = ctx->ops->tl_default_stride(ctx, bias->shape, other_fmt, /*eu_align*/ 0); + p.bias = bias; + } + + weight->start_address = -1; + weight->fmt = other_fmt; + weight->shape = weight_shape; + weight->stride = ctx->ops->tl_default_stride(ctx, weight->shape, other_fmt, /*align*/ 1); + p.weight = weight; + + ofmap->start_address = -1; + ofmap->fmt = other_fmt; + ofmap->shape = ofmap_shape; + ofmap->stride = ctx->ops->tl_default_stride(ctx, ofmap->shape, other_fmt, /*align*/ 1); + p.ofmap = ofmap; + + 
ifmap->start_address = -1; + ifmap->fmt = ifmt; + ifmap->shape = ifmap_shape; + ifmap->stride = ctx->ops->tl_default_stride(ctx, ifmap->shape, ifmt, /*align*/ 1); + p.ifmap = ifmap; + +#if 0 + int d_kh = calc_dilute_hw(kh, p.dilation_h - 1, 0, 0, 0); + int d_kw = calc_dilute_hw(kw, p.dilation_w - 1, 0, 0, 0); + if ((kh > pooling_ih_ext(&p, ih)) + || (kw > pooling_iw_ext(&p, iw)) + || (oh < d_kh) + || (ow < d_kw) + || (p.pad_top >= (1 << 4)) + || (p.pad_bottom >= (1 << 4)) + || (p.pad_left >= (1 << 4)) + || (p.pad_right >= (1 << 4)) + || !p.ofmap + || !p.ifmap + || !p.weight + || (using_bias && !p.bias) +) { + LOG(INFO) << "retry init_pooling_param"; + assert(0 && "it MUST valid param pass"); + goto retry; + } +#endif + return p; +} + +static void put_bias_tensor(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx, const cvk_tl_t *tl, + uint32_t data[]) { + int c = tl->shape.c; + + uint16_t *hi_lo = (uint16_t *)malloc(sizeof(uint16_t) * 2 * c); + if (tl->fmt == CVK_FMT_BF16) { + for (int i = 0; i < c; i++) { + hi_lo[i] = (data[i] >> 16) & 0xffff; + hi_lo[i + c] = (data[i] & 0xffff); + } + } else { + uint8_t *hi_lo_uint8_t = (uint8_t *)hi_lo; + uint16_t *data_uint16_t = (uint16_t *)data; + for (int i = 0; i < c; i++) { + hi_lo_uint8_t[i] = data_uint16_t[i] & 0xff; + hi_lo_uint8_t[i + c] = (data_uint16_t[i] >> 8) & 0xff; + } + } + put_bf16_tensor_g2l(ctx, bk_ctx, tl, (uint16_t *)hi_lo, tl->fmt); + + free(hi_lo); +} + +/** + * \brief + */ +static int reshape_valid_output(cvk_context_t *bk_ctx, const cvk_tl_t *ofmap, int org_oc, + int org_oh, int org_ow, cvk_tl_shape_t *tl_shape, + cvk_tl_stride_t *tl_load_stride, cvk_tg_shape_t *tg_shape, + cvk_tg_stride_t *tg_stride, cvk_fmt_t fmt) { + assert(fmt == CVK_FMT_BF16 || fmt == CVK_FMT_I8 || fmt == CVK_FMT_U8); + + // skip redundant one + // store to sys and re-slice, maybe use next layer + // sys->local skip redundant one + + tg_shape->n = tl_shape->n = 1; + tg_shape->c = tl_shape->c = org_oc; + tg_shape->h = tl_shape->h = org_oh; + 
tg_shape->w = tl_shape->w = org_ow; + + cvk_tl_stride_t s = bk_ctx->ops->tl_default_stride(bk_ctx, *tl_shape, fmt, /*eu_align*/ 0); + + tl_load_stride->n = s.n; + tl_load_stride->c = s.c; + tl_load_stride->h = s.h; + tl_load_stride->w = s.w; + + int duplicat_c = ofmap->shape.c / org_oc; + tg_stride->n = tg_stride->c = duplicat_c * ofmap->shape.h * ofmap->shape.w * get_fsz(fmt); + tg_stride->h = org_ow * get_fsz(fmt); + + return 0; +} + +static bmerr_t init_ref(int ic, int ih, int iw, int kh, int kw, int pad_right, int pad_left, + int stride_h, int stride_w, cvk_fmt_t fmt, uint16_t *input, + uint16_t *weight, uint32_t *bias, uint16_t *output_ref) { + bmerr_t ret; + int in = 1; + int ins_h = 0; + int ins_w = 0; + int ins_last_h = 0; + int ins_last_w = 0; + int dilation_h = 1; + int dilation_w = 1; + int pad_top = 0; + int pad_bottom = 0; + int rshift_bits = 0; + + if (fmt == CVK_FMT_BF16) { + ret = native_pooling_avg_bf16(input, weight, bias ? bias : NULL, output_ref, in, ic, ih, iw, kh, + kw, pad_top, pad_bottom, pad_left, pad_right, stride_h, stride_w, + ins_h, ins_w, ins_last_h, ins_last_w, dilation_h, dilation_w, 0); + } else { + int opd0_sign = fmt == CVK_FMT_I8; + int res0_sign = true; //(ofmap->fmt == CVK_FMT_I8); + ret = native_pooling_ave_int8((int8_t *)input, (int8_t *)weight, bias ? 
(int16_t *)bias : NULL, + (int8_t *)output_ref, in, ic, ih, iw, kh, kw, pad_top, pad_bottom, + pad_left, pad_right, stride_h, stride_w, ins_h, ins_w, ins_last_h, + ins_last_w, opd0_sign, res0_sign, rshift_bits, 0); + } + return ret; +} + +static int test_depthwise(CVI_RT_HANDLE ctx, cvk_context_t *bk_ctx, int ic, int ih, int iw, int kh, + int kw, int pad_right, int pad_left, int stride_h, int stride_w, + bool has_bias, cvk_fmt_t ifmt) { + // print_pooling_param(param); + param_t param; + param_t *p = ¶m; + assert(ifmt == CVK_FMT_BF16 || ifmt == CVK_FMT_I8 || ifmt == CVK_FMT_U8); + + int in = 1; + // TODO: verify dialate > 1 + int dilation_h = 1; + int dilation_w = 1; + int relu_enable = 0; + int rshift_bits = 0; + + // TODO: verity ins_x + int org_oh = pooling_oh(0, 0, 0, 0, stride_h, ih, kh, dilation_h); + int org_ow = pooling_ow(0, 0, pad_left, pad_right, stride_w, iw, kw, dilation_w); + int org_oc = ic; + int org_o_shape_size = in * org_oc * org_oh * org_ow; + uint16_t *output; + cvk_tdma_g2l_tensor_copy_param_t p1; + cvk_tdma_l2g_tensor_copy_param_t p2; + // weight / ofmap not support U8 format + cvk_fmt_t other_fmt = ifmt == CVK_FMT_BF16 ? 
CVK_FMT_BF16 : CVK_FMT_I8; + + // alloc testbench, input/ref + uint16_t *input = alloc_input(ic, ih, iw, ifmt); + uint16_t *weight = alloc_weight(ic, kh, kw, ifmt); + uint32_t *bias = NULL; + if (has_bias) bias = alloc_bias(ic, ifmt); + + uint16_t *output_ref = alloc_output(ic, org_oh, org_ow); + + // init ref + init_ref(ic, ih, iw, kh, kw, pad_right, pad_left, stride_h, stride_w, ifmt, input, weight, bias, + output_ref); + // assert(ret == BM_SUCCESS); + + // init param + // TODO: verify pad_top/pad_bottom + // TODO: verify ins_h_x + p->pad_left = pad_left; + p->pad_right = pad_right; + p->pad_top = 0; + p->pad_bottom = 0; + p->ins_h = 0; + p->ins_last_h = 0; + p->ins_w = 0; + p->ins_last_w = 0; + p->dilation_h = dilation_h; + p->dilation_w = dilation_w; + p->stride_h = stride_h; + p->stride_w = stride_w; + + p->relu_enable = relu_enable; + p->rshift_bits = rshift_bits; + p->bias = NULL; + + // prepard load / input / weight / bias / output new shape / stride + cvk_tl_shape_t tl_load_shape; + cvk_tl_stride_t tl_load_stride; + cvk_tg_shape_t tg_shape; + cvk_tg_stride_t tg_stride; + cvk_tl_shape_t tl_weight_shape; + cvk_tl_shape_t tl_bias_shape; + cvk_tl_shape_t tl_output_shape; + cvk_tl_t *tmp_tl_load; + cvk_tg_t *tmp_tg; + + // get reshaped information + int r = cvm_reshape_channel_same(bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, stride_h, + stride_w, &tl_load_shape, &tl_load_stride, &tg_shape, &tg_stride, + &tl_weight_shape, &tl_bias_shape, &tl_output_shape, ifmt, + /*align*/ 1); + + if (r == -1) { + printf("could not reshape it, 81\n"); + free_depthwise_param(bk_ctx, p); + + delete[] input; + free(weight); + free(bias); + return -1; + } + + // prepare input tg + { + cvk_tg_shape_t put_tg_shape; + + put_tg_shape.n = in; + put_tg_shape.c = ic; + put_tg_shape.h = ih; + put_tg_shape.w = iw; + cvk_tg_t *put_tg = alloc_tg_bf16_gmem(&ctx, bk_ctx, put_tg_shape, ifmt); + put_tg_bf16_gmem(&ctx, put_tg, (uint8_t *)input); + free_tg_gmem(&ctx, put_tg); + } + + // 
prepare load input, put to tg and load back + { + tmp_tl_load = alloc_tl_bf16(bk_ctx, tl_load_shape, ifmt, /*eu_align*/ 0); + assert(tmp_tl_load); + + tmp_tg = alloc_tg_bf16_gmem(&ctx, bk_ctx, tg_shape, ifmt); + tmp_tg->stride = tg_stride; + + p1.src = tmp_tg; + p1.dst = tmp_tl_load; + + bk_ctx->ops->tdma_g2l_bf16_tensor_copy(bk_ctx, &p1); + test_submit_comp(&ctx, bk_ctx); + free_tg_gmem(&ctx, tmp_tg); + + // fit for hw + tmp_tl_load->stride = + bk_ctx->ops->tl_default_stride(bk_ctx, tmp_tl_load->shape, ifmt, /*align*/ 1); + p->ifmap = tmp_tl_load; + } + + // prepare load bias, put to tg and load back + if (has_bias) { + // bias must i8 + cvk_fmt_t bias_fmt = ifmt == CVK_FMT_BF16 ? CVK_FMT_BF16 : CVK_FMT_I8; + p->bias = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, tl_bias_shape, bias_fmt, 0); + + // duplicate bias and replace old + uint32_t *new_bias = cvm_reshape_channel_bias((uint8_t *)bias, tl_bias_shape.n, tl_bias_shape.c, + tl_bias_shape.h, tl_bias_shape.w, org_oc, ifmt); + + // free old one + free(bias); + bias = new_bias; + put_bias_tensor(&ctx, bk_ctx, p->bias, bias); + } + + // prepare load weight, put to tg and load back + { + p->weight = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, tl_weight_shape, other_fmt, /*align*/ 1); + assert(p->weight); + + // duplicate kernel with c + uint8_t *new_weight = + cvm_reshape_channel_weight((uint8_t *)weight, tl_weight_shape.n, tl_weight_shape.c, + tl_weight_shape.h, tl_weight_shape.w, org_oc, ifmt); + + // free old one + free(weight); + weight = (uint16_t *)new_weight; + put_bf16_tensor_g2l(&ctx, bk_ctx, p->weight, (uint16_t *)weight, ifmt); + } + + // prepard ofmap + { + // we allocate 'same' mode shape + p->ofmap = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, tl_output_shape, other_fmt, /*align*/ 1); + assert(p->ofmap); + } + + // printf("p->ifmap at %p, c is %d\n", p->ifmap, tmp_tl_load->shape.c); + + // emit + if (ifmt == CVK_FMT_BF16) { + bk_ctx->ops->tiu_pt_depthwise_convolution(bk_ctx, p); + } else { + 
bk_ctx->ops->tiu_pt_depthwise_convolution(bk_ctx, p); + } + + // output = (uint16_t *)get_bf16_tensor_l2g(&ctx, bk_ctx, p->ofmap, ifmt); + + // check with no pad if true + int is_valid_pack = false; + cvk_tl_shape_t r_ofmap_shape; + cvk_tl_stride_t r_ofmap_stride; + cvk_tg_shape_t r_tg_shape; + cvk_tg_stride_t r_tg_stride; + + reshape_valid_output(bk_ctx, p->ofmap, org_oc, org_oh, org_ow, &r_ofmap_shape, &r_ofmap_stride, + &r_tg_shape, &r_tg_stride, ifmt); + + p1.dst = p->ofmap; + + if (is_valid_pack) { + cvk_tg_shape_t dst_shape; + dst_shape.n = p->ofmap->shape.n; + dst_shape.c = p->ofmap->shape.c; + dst_shape.h = p->ofmap->shape.h; + dst_shape.w = p->ofmap->shape.w; + cvk_tg_t *cvk_tg_tmp = alloc_tg_bf16_gmem(&ctx, bk_ctx, dst_shape, ifmt); + + p2.src = p->ofmap; + p2.dst = cvk_tg_tmp; + + // store for later reshape + bk_ctx->ops->tdma_l2g_bf16_tensor_copy(bk_ctx, &p2); + test_submit_comp(&ctx, bk_ctx); + + // free useless for later reallocate + free_depthwise_param(bk_ctx, p); + + p->ofmap = bk_ctx->ops->lmem_alloc_tensor(bk_ctx, r_ofmap_shape, ifmt, + /*eu_align*/ 0); + assert(p->ofmap); + + cvk_tg_tmp->shape = r_tg_shape; + cvk_tg_tmp->stride = r_tg_stride; + + p1.src = cvk_tg_tmp; + p1.dst = p->ofmap; + bk_ctx->ops->tdma_g2l_bf16_tensor_copy(bk_ctx, &p1); + free_tg_gmem(&ctx, cvk_tg_tmp); + } + + cvk_fmt_t ofmap_fmt = ifmt == CVK_FMT_BF16 ? 
CVK_FMT_BF16 : CVK_FMT_I8; + output = (uint16_t *)get_bf16_tensor_l2g(&ctx, bk_ctx, p1.dst, ofmap_fmt); + compare_results(p, input, weight, bias, output, output_ref, org_o_shape_size, is_valid_pack, + org_oc, org_oh, org_ow); + + // free resource + if (is_valid_pack) { + free_tl(bk_ctx, p->ofmap); + } else { + free_depthwise_param(bk_ctx, p); + } + + delete[] input; + free(weight); + free(bias); + free(output); + + return 1; +} + +static void init_input(param_t *p, int *ic, int *ih, int *iw, int *kh, int *kw, int *pad_right, + int *pad_left) { + *ic = p->ifmap->shape.c; + *ih = p->ifmap->shape.h; + *iw = p->ifmap->shape.w; + *kh = p->weight->shape.h; + *kw = p->weight->shape.w; + *pad_right = p->pad_right; + *pad_left = p->pad_left; +} + +static int test_depthwise_pooling(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx) { + int loop = 1; + int test_finished_num = 0; + int ihs[] = {24, 96, 120, 480, 0}; + int iws[] = {16, 17, 19, 23, 128, 256, 0}; + int stride_hs[] = {3, 4, 0}; + cvk_fmt_t formats[] = {CVK_FMT_I8, CVK_FMT_U8, CVK_FMT_BF16, CVK_FMT_F32}; + int ic, ih, iw, kh, kw, pad_right, pad_left; + cvk_fmt_t ifmt; + param_t param; + assert(print_pooling_param); + + ifmt = CVK_FMT_U8; + param = random_depthwise_param(bk_ctx, 210, 640, 1, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + print_pooling_param(¶m); + free_depthwise_struct(¶m); + +#if 1 + param = random_depthwise_param(bk_ctx, 36, 11, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + print_pooling_param(¶m); + free_depthwise_struct(¶m); + + ifmt = CVK_FMT_U8; + param = random_depthwise_param(bk_ctx, 24, 29, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, 
&pad_left); + free_depthwise_struct(¶m); + test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + + ifmt = CVK_FMT_BF16; + param = random_depthwise_param(bk_ctx, 480, 53, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + free_depthwise_struct(¶m); + test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + + ifmt = CVK_FMT_I8; + param = random_depthwise_param(bk_ctx, 480, 61, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + free_depthwise_struct(¶m); + test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + + ifmt = CVK_FMT_U8; + param = random_depthwise_param(bk_ctx, 24, 17, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + free_depthwise_struct(¶m); + test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + + ifmt = CVK_FMT_BF16; + param = random_depthwise_param(bk_ctx, 48, 65, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + free_depthwise_struct(¶m); + test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + + ifmt = CVK_FMT_I8; + param = random_depthwise_param(bk_ctx, 48, 63, 3, ifmt); + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + free_depthwise_struct(¶m); + test_finished_num += test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); +#endif + + for (int i = 0; i < loop; i++) { + for (int i = 0; ihs[i] != 0; i++) { + for (int j = 0; iws[j] != 0; j++) { + for (int k = 0; stride_hs[k] != 0; k++) { + for (int l = 0; 
formats[l] != 0; l++) { + continue; + if (ihs[i] >= 480 && formats[l] == CVK_FMT_BF16) { + continue; + } + param = random_depthwise_param(bk_ctx, ihs[i], iws[j], stride_hs[k], formats[l]); + ifmt = formats[l]; + printf("test[%d] ih/iw/sh/fmt is {%d, %d, %d, %d}\n", test_finished_num, ihs[i], iws[j], + stride_hs[k], formats[l]); + + init_input(¶m, &ic, &ih, &iw, &kh, &kw, &pad_right, &pad_left); + free_depthwise_struct(¶m); + int r = test_depthwise(*ctx, bk_ctx, ic, ih, iw, kh, kw, pad_right, pad_left, + param.stride_h, param.stride_w, param.bias, ifmt); + test_finished_num += r; + } + } + } + } + } + printf("Test finished %u\n", test_finished_num); + + return test_finished_num; +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bk_ctx; + + test_init(&ctx, &bk_ctx); + + int round_mode; + round_mode = set_store_feround(); + int ret = test_depthwise_pooling(&ctx, bk_ctx); + assert(ret >= 0); + (void)ret; + printf("pass\n"); + + test_exit(&ctx, bk_ctx); + restore_feround(round_mode); + return 0; +} diff --git a/cvimath/tests/cvi1835/fp32_bf16.cpp b/cvimath/tests/cvi1835/fp32_bf16.cpp new file mode 100644 index 000000000..477d8402b --- /dev/null +++ b/cvimath/tests/cvi1835/fp32_bf16.cpp @@ -0,0 +1,127 @@ +#include +#include +#include + +typedef cvk_tdma_g2g_tensor_copy_param_t param_t; + +static void __print_param(const char *tag, FILE *f, param_t *p) { + fprintf(f, "%s: (%u, %u, %u, %u) => (%u, %u, %u, %u)\n", tag, p->src->shape.n, p->src->shape.c, + p->src->shape.h, p->src->shape.w, p->dst->shape.n, p->dst->shape.c, p->dst->shape.h, + p->dst->shape.w); +} + +#define print_param(f, p) __print_param(__func__, f, p) + +typedef struct { + cvk_tg_shape_t src_shape; + cvk_tg_shape_t dst_shape; +} case_t; + +static cvk_fmt_type input_fmt[] = { + {CVK_FMT_BF16, CVK_FMT_BF16}, +}; + +static case_t g_cases[] = { + { + {1, 3, 3, 2}, + {1, 3, 3, 2}, + }, + { + {4, 3, 3, 2}, + {4, 3, 3, 2}, + }, + + //{ + // // YOLOv2 concat layer + // {1, 256, 19, 19}, + // {1, 256, 19, 
19}, + //}, + { + {1, 256, 19, 20}, + {1, 256, 19, 20}, + }, + { + {1, 1280, 3, 4}, + {1, 1280, 3, 4}, + }, + { + {1, 159 * 89, 36, 4}, + {1, 159 * 89, 36, 4}, + }, + { + {159, 89, 36, 4}, + {159, 89, 36, 4}, + }, +}; + +static void test_param_g2g(CVI_RT_HANDLE *ctx, cvk_context_t *bmk, param_t *p) { + print_param(stderr, p); + + // 2 means source is fp32, occupy 2 * bf16 size + uint64_t size = p->src->shape.n * p->src->shape.c * p->src->shape.h * p->src->shape.w / 2; + uint32_t *src_data = new uint32_t[size]; + for (uint64_t i = 0; i < size; i++) { + src_data[i] = ((0x1234 + i) << 16) + 0x5678 + i; + // printf("src[%lu] 0x%x\n", i, src_data[i]); + } + + test_put_tg_mem_comp(ctx, p->src, (uint8_t *)src_data); + + cvm_s2s_fp32_bf16(bmk, p->src->start_address, p->src->shape, p->dst->start_address, p->dst->shape, + CVK_FMT_BF16); + + long elapsed; + struct timeval t0, t1; + gettimeofday(&t0, NULL); + + test_submit_comp(ctx, bmk); + + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + printf("kernel takes %ld us\n", elapsed); + + uint16_t *dst_data = (uint16_t *)test_get_tg_mem_comp(ctx, p->dst); + + for (uint64_t i = 0; i < size; i++) { + uint16_t _src_data = (src_data[i] >> 16) & 0xffff; + if (dst_data[i] != _src_data) { + fprintf(stderr, "comparing failed at dst[%lu], got %x, exp %x\n", i, dst_data[i], _src_data); + exit(-1); + } + } + + delete[] src_data; + free(dst_data); +} + +static void destroy_param_g2g(CVI_RT_HANDLE *ctx, param_t *p) { + test_free_tg_mem_comp(ctx, p->src); + test_free_tg_mem_comp(ctx, p->dst); +} + +static void test_one_case(CVI_RT_HANDLE *ctx, cvk_context_t *bmk, case_t *c) { + uint32_t nr_fmt = sizeof(input_fmt) / sizeof(input_fmt[0]); + for (uint32_t i = 0; i < nr_fmt; i++) { + param_t p; + cvk_tg_t *src, *dst; + src = test_alloc_tg_mem_comp(ctx, bmk, c->src_shape, input_fmt[i].src_fmt); + dst = test_alloc_tg_mem_comp(ctx, bmk, c->dst_shape, input_fmt[i].dst_fmt); + p.src = src; + p.dst = 
dst; + test_param_g2g(ctx, bmk, &p); + destroy_param_g2g(ctx, &p); + } +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bmk; + + test_init(&ctx, &bmk); + + uint32_t nr_cases = sizeof(g_cases) / sizeof(g_cases[0]); + for (uint32_t i = 0; i < nr_cases; i++) test_one_case(&ctx, bmk, &g_cases[i]); + + test_exit(&ctx, bmk); + return 0; +} diff --git a/cvimath/tests/cvi1835/gemm.cpp b/cvimath/tests/cvi1835/gemm.cpp new file mode 100644 index 000000000..82254b171 --- /dev/null +++ b/cvimath/tests/cvi1835/gemm.cpp @@ -0,0 +1,845 @@ +#include +#include +#include +#include // clock + +typedef cvk_tiu_matrix_multiplication_param_t param_t; +int random_seed; + +static uint64_t matrix_size(const cvk_ml_t *ml) { + uint64_t row = ml->shape.n; + uint64_t col = ml->shape.col; + return row * col; +} + +static uint64_t res_size(param_t *p) { return matrix_size(p->res); } + +static uint16_t *alloc_left(param_t *p) { + uint64_t size = matrix_size(p->left); + uint16_t *buf = new uint16_t[size]; + for (uint64_t i = 0; i < size; i++) { + buf[i] = convert_fp32_bf16(i); + } + + return buf; +} + +static uint16_t *alloc_right(param_t *p) { + uint64_t size = matrix_size(p->right); + uint16_t *buf = new uint16_t[size]; + for (uint64_t i = 0; i < size; i++) { + float val = 0.01; + buf[i] = convert_fp32_bf16(i); + val += 0.01; + } + return buf; +} + +static uint32_t *alloc_bias(param_t *p) { + if (!p->bias) return NULL; + + uint64_t size = matrix_size(p->bias); + uint32_t *buf = new uint32_t[size]; + for (uint64_t i = 0; i < size; i++) { + buf[i] = convert_fp32_hex(i); + } + return buf; +} + +static uint32_t *alloc_res(param_t *p) { + uint64_t size = res_size(p); + uint32_t *buf = new uint32_t[size]; + for (uint64_t i = 0; i < size; i++) { + buf[i] = convert_fp32_bf16(i); + } + return buf; +} + +static inline void cvm_relu(float *buf, uint64_t size) { + for (uint64_t i = 0; i < size; i++) + if (buf[i] < 0) buf[i] = 0; +} + +static void matrix_mac_ref(param_t *p, uint16_t left[], uint16_t 
right[], uint32_t bias[], + uint32_t res[]) { + uint64_t size = res_size(p); + uint32_t left_col = p->left->shape.col; + uint32_t right_col = p->right->shape.col; + uint32_t res_row = p->left->shape.n; + uint32_t res_col = p->res->shape.col; + uint32_t left_c = p->left->shape.c; + uint32_t left_w = p->left->shape.w; + + float *tmp_res = new float[size]; + if (p->add_result) { + for (uint32_t i = 0; i < res_row * res_col; i++) tmp_res[i] = convert_bf16_fp32(res[i]); + } else { + for (uint32_t i = 0; i < res_row * res_col; i++) tmp_res[i] = 0; + } + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + for (uint32_t wi = 0; wi < left_w; wi++) { + for (uint32_t ci = 0; ci < left_c; ci++) { + if ((wi + (ci * left_w)) >= left_col) continue; + uint32_t li = row * left_col + left_w * ci + wi; + uint32_t ri = (ci * left_w + wi) * right_col + col; + + float l = convert_bf16_fp32(left[li]); + float r = convert_bf16_fp32(right[ri]); + tmp_res[row * res_col + col] += l * r; + } + } + } + } + + if (p->bias) { + for (uint32_t row = 0; row < res_row; row++) { + for (uint32_t col = 0; col < res_col; col++) { + float b = convert_hex_fp32(bias[col]); + tmp_res[row * res_col + col] += b; + } + } + } + + if (p->relu_enable) cvm_relu(tmp_res, size); + + for (uint64_t i = 0; i < size; i++) { + res[i] = convert_fp32_bf16(tmp_res[i]); + } + delete[] tmp_res; +} + +static void put_bias(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx, const cvk_ml_t *ml, + uint32_t data[]) { + uint64_t size = ml->shape.col; + + uint16_t *tmp = new uint16_t[size * 2]; + for (uint64_t i = 0; i < size; i++) { + tmp[i] = (data[i] >> 16) & 0xFFFF; + tmp[i + size] = (data[i] & 0xFFFF); + } + + test_put_matrix_g2l_comp(ctx, bk_ctx, ml, (uint8_t *)tmp); + + delete[] tmp; +} + +static void put_res(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx, const cvk_ml_t *ml, + uint32_t data[]) { + uint64_t size = ml->shape.n * ml->shape.col; + + uint16_t *tmp = new uint16_t[size]; + for 
(uint64_t i = 0; i < size; i++) { + tmp[i] = (data[i] & 0xFFFF); + } + + test_put_matrix_g2l_comp(ctx, bk_ctx, ml, (uint8_t *)tmp); + + delete[] tmp; +} + +static uint32_t *get_res(CVI_RT_HANDLE *ctx, cvk_mg_t *mg, param_t *p) { + uint64_t size = res_size(p); + uint32_t *res = new uint32_t[size]; + + uint16_t *tmp = (uint16_t *)test_get_mg_mem_comp(ctx, mg); + for (uint64_t i = 0; i < size; i++) res[i] = tmp[i]; + + delete[] tmp; + return res; +} + +static inline cvk_mg_t *put_bf16_matrix_g(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx, + const cvk_ml_t *ml, uint8_t data[], + cvk_fmt_t mg_data_format) { + cvk_mg_shape_t s; + s.row = ml->shape.n; + s.col = ml->shape.col; + cvk_mg_t *mg = test_alloc_mg_mem_comp(ctx, s, mg_data_format); + + test_put_mg_mem_comp(ctx, mg, data); + test_submit_comp(ctx, bk_ctx); + + return mg; +} + +static void test_param(CVI_RT_HANDLE *ctx, cvk_context_t *bk_ctx, param_t *p) { + uint16_t *left = alloc_left(p); + uint16_t *right = alloc_right(p); + uint32_t *bias = alloc_bias(p); + uint32_t *ref = alloc_res(p); + + cvk_mg_t *left_mg = put_bf16_matrix_g(ctx, bk_ctx, p->left, (uint8_t *)left, CVK_FMT_BF16); + cvk_mg_t *right_mg = put_bf16_matrix_g(ctx, bk_ctx, p->right, (uint8_t *)right, CVK_FMT_BF16); + cvk_mg_shape_t s; + s.row = p->res->shape.n; + s.col = p->res->shape.col; + cvk_mg_t *result_mg = test_alloc_mg_mem_comp(ctx, s, CVK_FMT_BF16); + + if (bias) put_bias(ctx, bk_ctx, p->bias, bias); + if (p->add_result) put_res(ctx, bk_ctx, p->res, ref); + + printf("start\n"); + size_t *slice_num = + cvm_gemm(bk_ctx, left_mg->start_address, right_mg->start_address, result_mg->start_address, + p->left->shape.n, p->left->shape.col, p->res->shape.col, CVK_FMT_BF16); + free(slice_num); // no need use in bf16 + test_submit_comp(ctx, bk_ctx); + + uint32_t *res = get_res(ctx, result_mg, p); + matrix_mac_ref(p, left, right, bias, ref); + + uint64_t size = res_size(p); + for (uint64_t i = 0; i < size; i++) { + if (res[i] != ref[i]) { + uint16_t _res = 
res[i] & 0xffff; + uint16_t _ref = ref[i] & 0xffff; + fprintf(stderr, "comparing failed at out[%lu], got %f(0x%x), exp %f(0x%x)\n", i, + convert_bf16_fp32(_res), res[i], convert_bf16_fp32(_ref), ref[i]); + fprintf(stderr, "random_seed=%d\n", random_seed); + exit(-1); + } + } + + test_free_mg_mem_comp(ctx, left_mg); + test_free_mg_mem_comp(ctx, right_mg); + test_free_mg_mem_comp(ctx, result_mg); + + delete[] left; + delete[] right; + delete[] bias; + delete[] res; +} + +static void destroy_param(cvk_context_t *bk_ctx, param_t *p) { + if (p->bias) bk_ctx->ops->lmem_free_matrix(bk_ctx, p->bias); + if (p->res) bk_ctx->ops->lmem_free_matrix(bk_ctx, p->res); + if (p->right) bk_ctx->ops->lmem_free_matrix(bk_ctx, p->right); + if (p->left) bk_ctx->ops->lmem_free_matrix(bk_ctx, p->left); +} + +static cvk_ml_t *alloc_param_res(cvk_context_t *bk_ctx, param_t *p) { + cvk_ml_shape_t s; + + s.n = p->left->shape.n; + s.c = p->right->shape.c; + s.w = p->right->shape.w; + s.col = p->right->shape.col; + cvk_fmt_t fmt = CVK_FMT_BF16; + cvk_ml_shape_t fake; + fake.n = 1; + fake.c = 1; + fake.w = 1; + fake.col = 1; + cvk_ml_t *t = bk_ctx->ops->lmem_alloc_matrix(bk_ctx, fake, fmt, 1); + t->shape = s; + return t; +} + +static param_t param_0(cvk_context_t *bk_ctx) { +retry: + random_seed = clock(); + srand(random_seed); + + param_t p; + memset(&p, 0, sizeof(p)); + p.lshift_bits = 0; + p.rshift_bits = 0; + p.res_is_int8 = true; + p.relu_enable = rand() % 2; + p.relu_enable = 0; + p.add_result = 0; /*bf16 HW does not support add_result*/ + p.ps32_mode = 0; + + uint32_t left_row = rand() % 100 + 1; + uint32_t left_col = rand() % 100 + 1; + left_row = 1024; + left_col = 1024; + uint32_t left_w = rand() % (left_col / 5 + 1) + 1; // c is generate by w, and make c is larger + uint32_t left_c = left_col / left_w + (left_col % left_w ? 
1 : 0); + + uint32_t right_row = left_col; + uint32_t right_col = rand() % 100 + 1; + right_col = 1024; + uint32_t right_w = (rand() % (right_col / 5 + 1) + 1); // make c is larger + uint32_t right_c = right_col / right_w + (right_col % right_w ? 1 : 0); + + cvk_ml_shape_t left_shape; + left_shape.n = left_row; + left_shape.c = left_c; + left_shape.w = left_w; + left_shape.col = left_col; + + cvk_ml_shape_t right_shape; + right_shape.n = right_row; + right_shape.c = right_c; + right_shape.w = right_w; + right_shape.col = right_col; + + uint32_t bias = rand() % 2; + bias = 0; + p.bias = NULL; + + cvk_ml_shape_t fake; + fake.n = 1; + fake.c = 1; + fake.w = 1; + fake.col = 1; + + cvk_ml_t *t = bk_ctx->ops->lmem_alloc_matrix(bk_ctx, fake, CVK_FMT_BF16, 1); + t->shape = left_shape; + p.left = t; + + t = bk_ctx->ops->lmem_alloc_matrix(bk_ctx, fake, CVK_FMT_BF16, 1); + t->shape = right_shape; + p.right = t; + if (!p.left || !p.right) { + printf("retry init_matrix_param\n"); + destroy_param(bk_ctx, &p); + goto retry; + } + + p.res = alloc_param_res(bk_ctx, &p); + if (bias) { + cvk_ml_shape_t bias_shape = right_shape; + bias_shape.n = 2; + p.bias = bk_ctx->ops->lmem_alloc_matrix(bk_ctx, bias_shape, CVK_FMT_BF16, 1); + } + + if (!p.res || (bias && !p.bias)) { + printf("retry init_matrix_param\n"); + destroy_param(bk_ctx, &p); + goto retry; + } + + return p; +} + +// gemm test function +//#define USE_CBLAS_VERITY (1) + +#ifdef USE_CBLAS_VERITY +#include +#endif /* ifdef USE_CBLAS_VERITY */ + +// comes from +// https://stackoverflow.com/questions/47023651/multiplying-matrices-in-one-dimensional-arrays +void multiply(uint16_t *a, int row1, int col1, uint16_t *b, int row2, int col2, uint16_t *d) { + assert(col1 == row2); + // silence error=unused-but-set-parameter warning + (void)row2; + + for (int i = 0; i < row1; i++) { + for (int j = 0; j < col2; j++) { + float sum = 0; + for (int k = 0; k < col1; k++) { + float _a = convert_bf16_fp32(a[i * col1 + k]); + float _b = 
convert_bf16_fp32(b[k * col2 + j]); + sum = sum + _a * _b; + } + d[i * col2 + j] = convert_fp32_bf16(sum); + } + } + +#if 0 + for (int i = 0; i < size; i++) { + if (i % col2 == 0) { + printf("\n"); + } + printf("%f ", convert_bf16_fp32(d[i])); + } +#endif +} + +#ifdef USE_CBLAS_VERITY +#else +static void multiply_i32(uint8_t *a, int row1, int col1, uint8_t *b, int row2, int col2, + uint32_t *d, cvk_fmt_t fmt) { + assert(col1 == row2); + // silence error=unused-but-set-parameter warning + (void)row2; + + for (int i = 0; i < row1; i++) { + for (int j = 0; j < col2; j++) { + int sum = 0; + for (int k = 0; k < col1; k++) { + int _a = fmt == CVK_FMT_I8 ? (int8_t)(a[i * col1 + k]) : (a[i * col1 + k]); + int _b = fmt == CVK_FMT_I8 ? (int8_t)(b[k * col2 + j]) : (b[k * col2 + j]); + // printf("sum = sum + _a * _b = %d = %d + %d * %d\n", sum + _a * _b, sum, _a, _b); + sum = sum + _a * _b; + } + // printf("out [%d] is %d\n", i * col2 + j, sum); + d[i * col2 + j] = (sum); + } + } + +#if 0 + for (int i = 0; i < size; i++) { + if (i % col2 == 0) { + printf("\n"); + } + printf("%f ", convert_bf16_fp32(d[i])); + } +#endif +} +#endif /* ifdef USE_CBLAS_VERITY */ + +int array_cmp_int16(const char *const info, const uint16_t *p_exp, const uint16_t *p_got, + int count) { + int idx; + for (idx = 0; idx < count; idx++) { + if (p_exp[idx] != p_got[idx]) { + printf("%s error at index %d exp %d(%f,0x%x) got %d(%f,0x%x)\n", info, idx, p_exp[idx], + convert_bf16_fp32(p_exp[idx]), p_exp[idx], p_got[idx], convert_bf16_fp32(p_got[idx]), + p_got[idx]); + return -1; + } + } + return 0; +} + +int array_cmp_int32(const char *const info, const uint32_t *p_exp, const uint32_t *p_got, + int count) { + int idx; + for (idx = 0; idx < count; idx++) { + if (p_exp[idx] != p_got[idx]) { + printf("%s error at index %d exp %d got %d\n", info, idx, p_exp[idx], p_got[idx]); + return -1; + } + } + return 0; +} + +static void assign_bf16_values_to_matrix(uint16_t *matrix, size_t size) { + float t; + for (size_t i 
= 0; i < size; i++) { + float f; +#if 1 + if (i % 2 == 0) t = i % 8; + if (i % 2 == 1) t = -1 * (i % 8); + f = t; +#else + t = i * (i % 2 ? -1 : 1); + f = t * 0.01 + size * 0.01; +#endif + matrix[i] = convert_fp32_bf16(f); + // printf("f[%lu] is %f(0x%x)\n", i, f, matrix[i]); + } +} + +static void uint16_to_float(float *float_data, uint16_t *bf16_data, size_t size) { + for (size_t i = 0; i < size; i++) { + float_data[i] = convert_bf16_fp32(bf16_data[i]); + } +} + +static void uint8_to_float(float *float_data, uint8_t *i8_data, size_t size, cvk_fmt_t fmt) { + for (size_t i = 0; i < size; i++) { + int input = (i8_data[i]); + if (fmt == CVK_FMT_I8) { + input = (int8_t)(i8_data[i]); + } + float_data[i] = (float)input; + } +} + +static void assign_i8_values_to_matrix(uint8_t *matrix, size_t size) { + for (size_t i = 0; i < size; i++) { + matrix[i] = i + 20; + } +} + +#ifdef USE_CBLAS_VERITY +static void float_to_int16(uint16_t *int16_data, float *float_data, size_t size) { + for (size_t i = 0; i < size; i++) { + int16_data[i] = convert_fp32_bf16(float_data[i]); + } +} + +static void float_to_int32(uint32_t *int32_data, float *float_data, size_t size) { + for (size_t i = 0; i < size; i++) { + int32_data[i] = (uint32_t)float_data[i]; + } +} +#endif + +// int8 +static int _test_bmblas_gemm_bm1880v2(size_t M, size_t N, size_t K, cvk_fmt_t fmt) { + long elapsed; + struct timeval t0, t1; + int ret = 0; + + uint8_t *i8_A = new uint8_t[M * K]; + uint8_t *i8_B = new uint8_t[N * K]; + uint8_t *i8_C = new uint8_t[4 * M * N]; // 32 bit output + uint32_t *i32bit_ref = new uint32_t[M * N]; + + assign_i8_values_to_matrix(i8_A, M * K); + assign_i8_values_to_matrix(i8_B, N * K); + + float *float_A = new float[M * K]; + float *float_B = new float[N * K]; + float *float_C_ref = new float[M * N]; + uint8_to_float(float_A, i8_A, M * K, fmt); + uint8_to_float(float_B, i8_B, N * K, fmt); + +#if 0 + printf("\nA:"); + for (int i = 0; i < M; i++) { + printf("\n"); + for (int j = 0; j < K; j++) { 
+ printf("%e(0x%x) ", float_A[i * K + j], i8_A[i * K + j]); + } + } + printf("\nB:"); + for (int i = 0; i < K; i++) { + printf("\n"); + for (int j = 0; j < N; j++) { + printf("%e(0x%x) ", float_B[i * N + j], i8_B[i * N + j]); + } + } + printf("\nR:"); + for (int i = 0; i < M; i++) { + printf("\n"); + for (int j = 0; j < N; j++) { + printf("%e ", convert_i8_fp32(i32bit_ref[i * N + j])); + } + } +#endif + gettimeofday(&t0, NULL); + +#ifdef USE_CBLAS_VERITY + float alpha = 0; + float beta = 0; + + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha, float_A, K, float_B, N, + beta, float_C_ref, N); + float_to_int32(i32bit_ref, float_C_ref, M * N); +#else /* ! ifdef USE_CBLAS_VERITY */ + multiply_i32(i8_A, M, K, i8_B, K, N, i32bit_ref, fmt); +#endif /* ifdef USE_CBLAS_VERITY */ + + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; +#ifdef USE_CBLAS_VERITY + printf("cblas GEMM takes %ld us\n", elapsed); +#else /* ! ifdef USE_CBLAS_VERITY */ + printf("CPU GEMM takes %ld us\n", elapsed); +#endif /* ifdef USE_CBLAS_VERITY */ + + CVI_RT_HANDLE ctx; + cvk_context_t *bk_ctx; + + test_init(&ctx, &bk_ctx); + + // alloc device memory + cvk_mg_shape_t s_a = {(uint32_t)M, (uint32_t)K}; + cvk_mg_shape_t s_b = {(uint32_t)K, (uint32_t)N}; + cvk_mg_shape_t s_r = {2 * (uint32_t)M, 2 * (uint32_t)N}; + + size_t s_size_a = mg_shape_size(&s_a) * bytesize_of_fmt(fmt); + size_t s_size_b = mg_shape_size(&s_b) * bytesize_of_fmt(fmt); + size_t s_size_r = mg_shape_size(&s_r) * bytesize_of_fmt(fmt); + + CVI_RT_MEM devmem_a = CVI_RT_MemAlloc(ctx, s_size_a); + CVI_RT_MEM devmem_b = CVI_RT_MemAlloc(ctx, s_size_b); + CVI_RT_MEM devmem_r = CVI_RT_MemAlloc(ctx, s_size_r); + + gaddr_t gaddr_a = CVI_RT_MemGetPAddr(devmem_a); + gaddr_t gaddr_b = CVI_RT_MemGetPAddr(devmem_b); + gaddr_t gaddr_r = CVI_RT_MemGetPAddr(devmem_r); + + // copy to device memory + CVI_RT_MemCopyS2D(ctx, devmem_a, (uint8_t *)i8_A); + CVI_RT_MemCopyS2D(ctx, devmem_b, 
(uint8_t *)i8_B); + CVI_RT_MemCopyS2D(ctx, devmem_r, (uint8_t *)i8_C); + + // do computation with bmkernel + // bmruntime_bmkernel_create(ctx, (void**)&bk_ctx); + + // printf("gaddr_a/gaddr_b/gaddr_r at %zx %zx %zx\n", gaddr_a, gaddr_b, gaddr_r); + size_t *slice_num = cvm_gemm((cvk_context_t *)bk_ctx, gaddr_a, gaddr_b, gaddr_r, M, K, N, fmt); + + gettimeofday(&t0, NULL); + test_submit_comp(&ctx, bk_ctx); + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + printf("TPU GEMM takes %ld us\n", elapsed); + + CVI_RT_MemCopyD2S(ctx, (uint8_t *)i8_C, devmem_r); + + CVI_RT_MemFree(ctx, devmem_a); + CVI_RT_MemFree(ctx, devmem_b); + CVI_RT_MemFree(ctx, devmem_r); + + test_exit(&ctx, bk_ctx); + + uint32_t *i32_C = new uint32_t[M * N]; // 32 bit output with stirded + + cvm_combin_gemm_i8(slice_num, i8_C, i32_C, M, N); + + free(slice_num); + + int cmp_res = array_cmp_int32("gemm", i32bit_ref, i32_C, M * N); + if (cmp_res != 0) { + ret = -1; + printf("Comparison failed for cblas_sgemm and bmblas_gemm!\n"); +#if 0 + printf("\nref/cmd is:"); + for (int i = 0; i < M; i++) { + printf(">\n"); + for (int j = 0; j < N; j++) { + printf("%f(0x%x)/%f(0x%x) ", + convert_i8_fp32(i32bit_ref[i * N + j]), i32bit_ref[i * N + j], + convert_i8_fp32(i8_C[i * N + j]), i8_C[i * N + j] + ); + } + } +#endif + } else { + // printf("Comparison done for cblas_sgemm and bmblas_gemm!\n\n"); + } + + delete[] float_A; + delete[] float_B; + delete[] float_C_ref; + delete[] i8_A; + delete[] i8_B; + delete[] i8_C; + delete[] i32bit_ref; + delete[] i32_C; + return ret; +} + +int test_bmblas_gemm_bm1880v2(size_t M, size_t N, size_t K, cvk_fmt_t fmt) { + printf("%s: M=%zu, N=%zu, K=%zu, fmt_sz: %d\n", __func__, M, N, K, cvm_bytesize_of_fmt(fmt)); + + // FIXME: not duplicate + if (fmt != CVK_FMT_BF16) { + return _test_bmblas_gemm_bm1880v2(M, N, K, fmt); + } + + long elapsed; + struct timeval t0, t1; + int ret = 0; + + uint16_t *bf16_A = new uint16_t[M * K]; + uint16_t 
*bf16_B = new uint16_t[N * K]; + uint16_t *bf16_C = new uint16_t[2 * M * N]; + uint16_t *int16_C_ref = new uint16_t[M * N]; + + assign_bf16_values_to_matrix(bf16_A, M * K); + assign_bf16_values_to_matrix(bf16_B, N * K); + + float *float_A = new float[M * K]; + float *float_B = new float[N * K]; + float *float_C_ref = new float[M * N]; + uint16_to_float(float_A, bf16_A, M * K); + uint16_to_float(float_B, bf16_B, N * K); + +#if 0 + printf("\nA:"); + for (int i = 0; i < M; i++) { + printf("\n"); + for (int j = 0; j < K; j++) { + printf("%e(0x%x) ", float_A[i * K + j], bf16_A[i * K + j]); + } + } + printf("\nB:"); + for (int i = 0; i < K; i++) { + printf("\n"); + for (int j = 0; j < N; j++) { + printf("%e(0x%x) ", float_B[i * N + j], bf16_B[i * N + j]); + } + } + printf("\nR:"); + for (int i = 0; i < M; i++) { + printf("\n"); + for (int j = 0; j < N; j++) { + printf("%e ", convert_bf16_fp32(int16_C_ref[i * N + j])); + } + } +#endif + gettimeofday(&t0, NULL); + +#ifdef USE_CBLAS_VERITY + float alpha = 0; + float beta = 0; + cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, M, N, K, alpha, float_A, K, float_B, N, + beta, float_C_ref, N); + float_to_int16(int16_C_ref, float_C_ref, M * N); +#else /* ! 
ifdef USE_CBLAS_VERITY */ + multiply(bf16_A, M, K, bf16_B, K, N, int16_C_ref); +#endif /* ifdef USE_CBLAS_VERITY */ + + delete[] float_A; + delete[] float_B; + delete[] float_C_ref; + + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; +#ifdef USE_CBLAS_VERITY + printf("cblas GEMM takes %ld us\n", elapsed); +#else + printf("CPU GEMM takes %ld us\n", elapsed); +#endif + + CVI_RT_HANDLE ctx; + cvk_context_t *bk_ctx; + + test_init(&ctx, &bk_ctx); + + // alloc device memory + cvk_mg_shape_t s_a = {(uint32_t)M, (uint32_t)K}; + cvk_mg_shape_t s_b = {(uint32_t)K, (uint32_t)N}; + cvk_mg_shape_t s_r = {(uint32_t)M, (uint32_t)N}; + + size_t s_size_a = mg_shape_size(&s_a) * bytesize_of_fmt(fmt); + size_t s_size_b = mg_shape_size(&s_b) * bytesize_of_fmt(fmt); + size_t s_size_r = mg_shape_size(&s_r) * bytesize_of_fmt(fmt) * bytesize_of_fmt(fmt); + + CVI_RT_MEM devmem_a = CVI_RT_MemAlloc(ctx, s_size_a); + CVI_RT_MEM devmem_b = CVI_RT_MemAlloc(ctx, s_size_b); + CVI_RT_MEM devmem_r = CVI_RT_MemAlloc(ctx, s_size_r); + + gaddr_t gaddr_a = CVI_RT_MemGetPAddr(devmem_a); + gaddr_t gaddr_b = CVI_RT_MemGetPAddr(devmem_b); + gaddr_t gaddr_r = CVI_RT_MemGetPAddr(devmem_r); + + // copy to device memory + CVI_RT_MemCopyS2D(ctx, devmem_a, (uint8_t *)bf16_A); + CVI_RT_MemCopyS2D(ctx, devmem_b, (uint8_t *)bf16_B); + CVI_RT_MemCopyS2D(ctx, devmem_r, (uint8_t *)bf16_C); + // do computation with bmkernel + // bmruntime_bmkernel_create(ctx, (void**)&bk_ctx); + + size_t *slice_num = + cvm_gemm((cvk_context_t *)bk_ctx, gaddr_a, gaddr_b, gaddr_r, M, K, N, CVK_FMT_BF16); + free(slice_num); // no use slice_num infomation in BF16 + + gettimeofday(&t0, NULL); + test_submit_comp(&ctx, bk_ctx); + gettimeofday(&t1, NULL); + elapsed = (t1.tv_sec - t0.tv_sec) * 1000000 + t1.tv_usec - t0.tv_usec; + printf("TPU GEMM takes %ld us\n", elapsed); + + CVI_RT_MemCopyD2S(ctx, (uint8_t *)bf16_C, devmem_r); + + // bmruntime_bmkernel_destroy(ctx); + + CVI_RT_MemFree(ctx, 
devmem_a); + CVI_RT_MemFree(ctx, devmem_b); + CVI_RT_MemFree(ctx, devmem_r); + + test_exit(&ctx, bk_ctx); + + int cmp_res = array_cmp_int16("gemm", int16_C_ref, bf16_C, M * N); + if (cmp_res != 0) { + ret = -1; + printf("Comparison failed for cblas_sgemm and bmblas_gemm!\n"); +#if 0 + printf("\nref/cmd is:"); + for (int i = 0; i < M; i++) { + printf(">\n"); + for (int j = 0; j < N; j++) { + printf("%f(0x%x)/%f(0x%x) ", + convert_bf16_fp32(int16_C_ref[i * N + j]), int16_C_ref[i * N + j], + convert_bf16_fp32(bf16_C[i * N + j]), bf16_C[i * N + j] + ); + } + } +#endif + } else { + // printf("Comparison done for cblas_sgemm and bmblas_gemm!\n\n"); + } + + delete[] bf16_A; + delete[] bf16_B; + delete[] bf16_C; + delete[] int16_C_ref; + return ret; +} + +#define test_one_param(n) \ + do { \ + param_t p = param_##n(bk_ctx); \ + test_param(&ctx, bk_ctx, &p); \ + destroy_param(bk_ctx, &p); \ + } while (0) + +int main() { + int round_mode; + round_mode = set_store_feround(); + CVI_RT_HANDLE ctx; + cvk_context_t *bk_ctx; + + test_init(&ctx, &bk_ctx); + + // int8 example + if (0 != test_bmblas_gemm_bm1880v2(1, 100, 512, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(2, 100, 512, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(4, 100, 512, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(8, 100, 512, CVK_FMT_I8)) exit(-1); + + if (0 != test_bmblas_gemm_bm1880v2(1, 20000, 512, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(10, 200, 10, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(1, 200, 500, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(1, 20, 50, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(2, 10, 100, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(2, 1000, 5, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(20, 5, 5, CVK_FMT_I8)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(2, 5, 5, CVK_FMT_I8)) exit(-1); + cvk_fmt_t fmts[2] = {CVK_FMT_BF16, CVK_FMT_I8}; + // 
cvk_fmt_t fmts[1] = {CVK_FMT_BF16}; + int fmts_sz = sizeof(fmts) / sizeof(fmts[0]); + + for (int i = 0; i < fmts_sz; i++) { + cvk_fmt_t fmt = fmts[i]; + if (0) { + // backend implement + for (int i = 0; i < 30; i++) test_one_param(0); + + } else { + // gemm, plz refer bmtap2/libbmblas + int M = 10000; + int N = 10000; + int K = 1024; + M = 2000; + N = 2000; + int m, k, n; + + if (0) { + for (m = 1; m <= M; m *= 10) { + for (n = 1; n <= N; n += 200) { + for (k = 1; k <= K; k *= 2) { + if (0 != test_bmblas_gemm_bm1880v2(m, n, k, fmt)) { + exit(-1); + } + } + } + } + } + + if (1) { + if (0 != test_bmblas_gemm_bm1880v2(1, 500, 512, fmt)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(1, 750, 512, fmt)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(1, 100, 512, fmt)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(2, 100, 512, fmt)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(4, 100, 512, fmt)) exit(-1); + if (0 != test_bmblas_gemm_bm1880v2(8, 100, 512, fmt)) exit(-1); + // if (0 != test_bmblas_gemm_bm1880v2(1, 50000, 512, fmt)) exit(-1); + // if (0 != test_bmblas_gemm_bm1880v2(1, 75000, 512, fmt)) exit(-1); + // if (0 != test_bmblas_gemm_bm1880v2(1, 10000, 512, fmt)) exit(-1); + // if (0 != test_bmblas_gemm_bm1880v2(2, 10000, 512, fmt)) exit(-1); + // if (0 != test_bmblas_gemm_bm1880v2(4, 10000, 512, fmt)) exit(-1); + // if (0 != test_bmblas_gemm_bm1880v2(8, 10000, 512, fmt)) exit(-1); + } + + printf("Comparison done for cblas_sgemm and bmblas_gemm!\n\n"); + } + } + + test_exit(&ctx, bk_ctx); + restore_feround(round_mode); + return 0; +} diff --git a/cvimath/tests/cvi1835/mask.cpp b/cvimath/tests/cvi1835/mask.cpp new file mode 100644 index 000000000..98211ef11 --- /dev/null +++ b/cvimath/tests/cvi1835/mask.cpp @@ -0,0 +1,158 @@ +#include +#include + +#define OUT +#define IN +#include +#include +#include +#include +#include +#include +//#define DBG + +using namespace std; + +/** + * pre_data means we test fixed pattern, it should be same sa lut + */ +// enum 
TEST_MODE { +// CVM_MASK_TYPE_GT_0 = 0, // remain > 0 +// //CVM_MASK_TYPE_GE_0, // remain >= 0 +// //CVM_MASK_TYPE_EQ_0, // remain = 0 +// //CVM_MASK_TYPE_LT_0, // remain < 0 +// //CVM_MASK_TYPE_LE_0, // remain <= 0 +// CVM_MASK_MAX +//}; + +enum CVM_MASK_TYPE mode; + +struct pattern { + float *input; + float *ref; + int len; +}; +#define SIZEOF(x) (sizeof(x) / sizeof(x[0])) +float cvm_mask_type_gt_0_input[] = {-1 * pow(2, -62), -0.003, -1.0, -100000, 0.000001, 1, 1000, + pow(2, 62), 0}; + +float cvm_mask_type_gt_0_output[] = {0, 0, 0, 0, 1, 1, 1, 1, 0}; +float cvm_mask_type_ge_0_output[] = {0, 0, 0, 0, 1, 1, 1, 1, 1}; +float cvm_mask_type_eq_0_output[] = {0, 0, 0, 0, 0, 0, 0, 0, 1}; +float cvm_mask_type_lt_0_output[] = {1, 1, 1, 1, 0, 0, 0, 0, 0}; +float cvm_mask_type_le_0_output[] = {1, 1, 1, 1, 0, 0, 0, 0, 1}; + +int input_sz = sizeof(cvm_mask_type_gt_0_input) / sizeof(cvm_mask_type_gt_0_input[0]); + +static struct pattern patterns[] = { + {cvm_mask_type_gt_0_input, cvm_mask_type_gt_0_output, input_sz}, + {cvm_mask_type_gt_0_input, cvm_mask_type_ge_0_output, input_sz}, + {cvm_mask_type_gt_0_input, cvm_mask_type_eq_0_output, input_sz}, + {cvm_mask_type_gt_0_input, cvm_mask_type_lt_0_output, input_sz}, + {cvm_mask_type_gt_0_input, cvm_mask_type_le_0_output, input_sz}, +}; + +static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) { + cvk_fmt_t fmt = CVK_FMT_BF16; + struct pattern *p = &patterns[mode]; + uint32_t input_n = 1; + uint32_t input_c = 1; + uint32_t input_h = 1; + uint32_t input_w = p->len; + + cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + cvk_tl_shape_t ofmap_shape = ifmap_shape; + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + + int data_type_size = bytesize_of_fmt(fmt); + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + + cvk_tl_shape_t table_shape; + uint64_t table_bytesize = 
cvm_lut_tbl_bytesize(bmk, &table_shape, fmt); + + cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *out = tl_ofmap_bf16; + cvk_tl_t *tl_pos_neg_buf = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_0_idx_table = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + + // temp buf + cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf2 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf4 = test_alloc_tl(bmk, tl_ofmap_bf16->shape, fmt, /*align*/ 1); + + uint16_t *input_data = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize); + uint16_t *table_data_atan_pos_neg = (uint16_t *)xmalloc(table_bytesize); + uint16_t *idx_0_table_data = (uint16_t *)xmalloc(table_bytesize); + + cvm_gen_0_tbl(idx_0_table_data, &table_shape); + cvm_pos_neg_tbl(table_data_atan_pos_neg, &table_shape); + + for (uint32_t i = 0; i < ifmap_size; i++) { + input_data[i] = convert_fp32_bf16(p->input[i]); + ref_data[i] = convert_fp32_bf16(p->ref[i]); + } + + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)input_data); + test_put_tensor_g2l_comp(ctx, bmk, tl_pos_neg_buf, (uint8_t *)table_data_atan_pos_neg); + test_put_tensor_g2l_comp(ctx, bmk, tl_0_idx_table, (uint8_t *)idx_0_table_data); + + cvm_emit_mask(bmk, tl_ifmap, tl_buf, tl_buf2, tl_buf4, tl_pos_neg_buf, tl_0_idx_table, out, fmt, + mode); + + test_submit_comp(ctx, bmk); + + uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, out); + + for (uint32_t i = 0; i < ifmap_size; i++) { + if (ref_data[i] != ofmap_data[i]) { + fprintf(stderr, + "comparing failed at mode %d ofmap_data[%u] got %f(0x%x), ref " + "%f(0x%x)\n", + mode, i, convert_bf16_fp32(ofmap_data[i]), ofmap_data[i], + convert_bf16_fp32(ref_data[i]), ref_data[i]); + exit(-1); + } + } +#if 0 + if (!is_close) { + float input = 
convert_bf16_fp32(ifmap[i]); + } +#endif + free_tl(bmk, tl_buf4); + free_tl(bmk, tl_buf2); + free_tl(bmk, tl_buf); + free_tl(bmk, tl_0_idx_table); + free_tl(bmk, tl_pos_neg_buf); + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_ifmap); + + free(input_data); + free(ref_data); + free(ofmap_data); + free(table_data_atan_pos_neg); + free(idx_0_table_data); +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bmk; + int round_mode; + + round_mode = set_store_feround(); + + test_init(&ctx, &bmk); + + for (int i = CVM_MASK_TYPE_GT_0; i < CVM_MASK_MAX; i++) { + mode = static_cast(i); + printf("test mode %d...\n", mode); + testbench(&ctx, bmk); + } + + test_exit(&ctx, bmk); + restore_feround(round_mode); + return 0; +} diff --git a/cvimath/tests/cvi1835/reciprocal.cpp b/cvimath/tests/cvi1835/reciprocal.cpp new file mode 100644 index 000000000..586018dea --- /dev/null +++ b/cvimath/tests/cvi1835/reciprocal.cpp @@ -0,0 +1,376 @@ +/** + */ +#include +#include + +#include +#include +#include +#include +#include +#include +//#define DBG + +using namespace std; + +/** + * pre_data means we test fixed pattern, it should be same sa lut + */ +enum TEST_MODE { + PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare + GEN_POW_20_DATA_MAX_ERROR, // generate 2^-20 ~ 2^20 value that check epsilon + TEST_MODE_MAX, +}; + +static TEST_MODE mode; + +static uint16_t test_pattern[] = { + 0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52, 0x3A6C, 0x3A83, 0x3A90, + 0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF, 0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17, + 0x3B1D, 0x3B24, 0x3B2A, 0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65, + 0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90, 0x3B93, 0x3B97, 0x3B9A, + 0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE, 0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1, + 0x3BC5, 0x3BC8, 0x3BCB, 0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9, + 0x3BEC, 0x3BEF, 0x3BF2, 
0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03, 0x3C05, 0x3C06, 0x3C08, + 0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12, 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C, + 0x3C1D, 0x3C1F, 0x3C21, 0x3C22, 0x3C24, 0x3C25, 0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F, + 0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E, 0x3C40, 0x3C41, 0x3C43, + 0x3C45, 0x3C46, 0x3C48, 0x3C4A, 0x3C4B, 0x3C4D, 0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57, + 0x3C58, 0x3C5A, 0x3C5C, 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A, + 0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, 0x3C7B, 0x3C7C, 0x3C7E, + 0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89, + 0x3C8A, 0x3C8A, 0x3C8B, 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93, + 0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, 0x3C9B, 0x3C9C, 0x3C9C, + 0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6, + 0x3CA7, 0x3CA8, 0x3CA9, 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0, + 0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, 0x3CB8, 0x3CB9, 0x3CBA, + 0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4, + 0x3CC5, 0x3CC5, 0x3CC6, 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE, + 0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, 0x3CD6, 0x3CD7, 0x3CD7, + 0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1, + 0x3CE2, 0x3CE3, 0x3CE4, 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB, + 0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, 0x3CF3, 0x3CF4, 0x3CF5, + 0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF, + 0x3D00, 0x3D00, 0x3D01, 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 
0x3D03, 0x3D03, 0x3D04, 0x3D04, + 0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, 0x3D08, 0x3D09, 0x3D09, + 0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 0x3D0E, + 0x3D0F, 0x3D0F, 0x3D0F, 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13, + 0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 0x3D16, 0x3D17, 0x3D17, 0x3D18, 0x3D18, + 0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D, + 0x3D1D, 0x3D1E, 0x3D1E, 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22, + 0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, 0x3D26, 0x3D26, 0x3D27, + 0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C, + 0x3D2C, 0x3D2C, 0x3D2D, 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31, + 0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 0x3D33, 0x3D33, 0x3D34, 0x3D34, 0x3D35, 0x3D35, 0x3D35, + 0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A, + 0x3D3B, 0x3D3B, 0x3D3C, 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F, + 0x3D40, 0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, 0x3D43, 0x3D44, 0x3D44, + 0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49, + 0x3D4A, 0x3D4A, 0x3D4A, 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E, + 0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, 0x3D52, 0x3D53, 0x3D53, + 0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58, + 0x3D58, 0x3D59, 0x3D59, 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D, + 0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, 0x3D61, 0x3D61, 0x3D62, + 0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67, + 
0x3D67, 0x3D67, 0x3D68, 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C, + 0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, 0x3D70, 0x3D70, 0x3D70, + 0x3D71, 0x3D71, 0x3D72, 0x3D72, 0x3D72, 0x3D73, 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75, + 0x3D76, 0x3D76, 0x3D77, 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A, + 0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, 0x3D7E, 0x3D7F, 0x3D7F, + 0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82, + 0x3D82, 0x3D82, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85, + 0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D87, 0x3D87, 0x3D87, + 0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89, + 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8C, 0x3D8C, + 0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, + 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91, + 0x3D91, 0x3D91, 0x3D91, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93, + 0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, 0x3D95, 0x3D96, 0x3D96, + 0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98, + 0x3D98, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B, + 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9D, 0x3D9D, 0x3D9D, + 0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0, + 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2, + 0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, + 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 
0x3DA6, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7, + 0x3DA7, 0x3DA7, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, + 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAC, 0x3DAC, + 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, + 0x3DAE, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1, + 0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3, + 0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6, + 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8, + 0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBB, + 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD, + 0x3DBD, 0x3DBD, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, + 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC2, 0x3DC2, 0x3DC2, + 0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, + 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7, + 0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, 0x3DC9, 0x3DC9, 0x3DC9, + 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC, + 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5, + 0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, 0x3DDF, 0x3DE0, 0x3DE1, + 0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5, +}; + +static uint16_t test_pattern_ref[] = { + 0x7f7f, 0x461c, 0x459c, 0x4551, 0x451c, 0x44fa, 0x44d1, 0x44b2, 0x449c, 0x448b, 0x447a, 0x4464, + 0x4451, 0x4441, 0x4432, 0x4426, 0x441c, 0x4413, 0x440b, 0x4404, 0x43fa, 0x43ed, 0x43e4, 0x43d9, + 
0x43d1, 0x43c8, 0x43c1, 0x43b9, 0x43b2, 0x43ac, 0x43a6, 0x43a1, 0x439c, 0x4398, 0x4393, 0x438f, + 0x438b, 0x4387, 0x4384, 0x4380, 0x437a, 0x4375, 0x436d, 0x4368, 0x4364, 0x435f, 0x4359, 0x4355, + 0x4351, 0x434c, 0x4348, 0x4344, 0x4341, 0x433c, 0x4339, 0x4336, 0x4332, 0x432f, 0x432c, 0x432a, + 0x4326, 0x4324, 0x4321, 0x431f, 0x431c, 0x431a, 0x4318, 0x4315, 0x4313, 0x4311, 0x430f, 0x430d, + 0x430b, 0x4309, 0x4307, 0x4305, 0x4304, 0x4302, 0x4300, 0x42fe, 0x42fa, 0x42f6, 0x42f5, 0x42f1, + 0x42ed, 0x42ec, 0x42e8, 0x42e5, 0x42e4, 0x42e0, 0x42df, 0x42dc, 0x42d9, 0x42d8, 0x42d5, 0x42d2, + 0x42d1, 0x42ce, 0x42cc, 0x42ca, 0x42c8, 0x42c7, 0x42c4, 0x42c2, 0x42c1, 0x42bf, 0x42bc, 0x42bb, + 0x42b9, 0x42b7, 0x42b6, 0x42b4, 0x42b2, 0x42b1, 0x42af, 0x42ae, 0x42ac, 0x42ab, 0x42aa, 0x42a8, + 0x42a6, 0x42a5, 0x42a4, 0x42a2, 0x42a1, 0x42a0, 0x429f, 0x429e, 0x429c, 0x429b, 0x429a, 0x4298, + 0x4298, 0x4296, 0x4295, 0x4294, 0x4293, 0x4292, 0x4291, 0x4290, 0x428f, 0x428e, 0x428d, 0x428c, + 0x428b, 0x428a, 0x4289, 0x4288, 0x4287, 0x4286, 0x4285, 0x4285, 0x4284, 0x4283, 0x4282, 0x4281, + 0x4280, 0x427e, 0x427e, 0x427c, 0x427a, 0x4278, 0x4276, 0x4275, 0x4275, 0x4273, 0x4271, 0x426f, + 0x426d, 0x426d, 0x426c, 0x426a, 0x4268, 0x4267, 0x4265, 0x4265, 0x4264, 0x4262, 0x4260, 0x425f, + 0x425f, 0x425d, 0x425c, 0x425a, 0x4259, 0x4258, 0x4258, 0x4256, 0x4255, 0x4253, 0x4252, 0x4252, + 0x4251, 0x424f, 0x424e, 0x424d, 0x424c, 0x424c, 0x424a, 0x4249, 0x4248, 0x4247, 0x4247, 0x4245, + 0x4244, 0x4243, 0x4242, 0x4241, 0x4241, 0x4240, 0x423f, 0x423d, 0x423c, 0x423c, 0x423b, 0x423a, + 0x4239, 0x4238, 0x4237, 0x4237, 0x4236, 0x4235, 0x4234, 0x4233, 0x4232, 0x4232, 0x4231, 0x4230, + 0x422f, 0x422e, 0x422e, 0x422d, 0x422c, 0x422c, 0x422b, 0x422a, 0x422a, 0x4229, 0x4228, 0x4227, + 0x4226, 0x4226, 0x4225, 0x4225, 0x4224, 0x4223, 0x4222, 0x4222, 0x4221, 0x4221, 0x4220, 0x421f, + 0x421f, 0x421e, 0x421e, 0x421d, 0x421c, 0x421b, 0x421b, 0x421b, 0x421a, 0x4219, 0x4218, 0x4218, + 0x4218, 0x4217, 0x4216, 0x4216, 0x4215, 
0x4215, 0x4214, 0x4214, 0x4213, 0x4212, 0x4212, 0x4212, + 0x4211, 0x4210, 0x4210, 0x420f, 0x420f, 0x420e, 0x420e, 0x420d, 0x420d, 0x420d, 0x420c, 0x420b, + 0x420b, 0x420a, 0x420a, 0x420a, 0x4209, 0x4209, 0x4208, 0x4207, 0x4207, 0x4207, 0x4206, 0x4206, + 0x4205, 0x4205, 0x4205, 0x4204, 0x4204, 0x4203, 0x4203, 0x4203, 0x4202, 0x4202, 0x4201, 0x4201, + 0x4200, 0x4200, 0x41fe, 0x41fe, 0x41fe, 0x41fc, 0x41fc, 0x41fa, 0x41fa, 0x41fa, 0x41f8, 0x41f8, + 0x41f6, 0x41f6, 0x41f5, 0x41f5, 0x41f5, 0x41f3, 0x41f3, 0x41f1, 0x41f1, 0x41f1, 0x41ef, 0x41ef, + 0x41ed, 0x41ed, 0x41ed, 0x41ec, 0x41ec, 0x41ea, 0x41ea, 0x41ea, 0x41e8, 0x41e8, 0x41e7, 0x41e7, + 0x41e5, 0x41e5, 0x41e5, 0x41e4, 0x41e4, 0x41e2, 0x41e2, 0x41e2, 0x41e0, 0x41e0, 0x41df, 0x41df, + 0x41df, 0x41dd, 0x41dd, 0x41dc, 0x41dc, 0x41da, 0x41da, 0x41da, 0x41d9, 0x41d9, 0x41d8, 0x41d8, + 0x41d8, 0x41d6, 0x41d6, 0x41d5, 0x41d5, 0x41d5, 0x41d3, 0x41d3, 0x41d2, 0x41d2, 0x41d2, 0x41d1, + 0x41d1, 0x41cf, 0x41cf, 0x41ce, 0x41ce, 0x41ce, 0x41cd, 0x41cd, 0x41cc, 0x41cc, 0x41cc, 0x41ca, + 0x41ca, 0x41c9, 0x41c9, 0x41c9, 0x41c8, 0x41c8, 0x41c7, 0x41c7, 0x41c7, 0x41c5, 0x41c5, 0x41c4, + 0x41c4, 0x41c3, 0x41c3, 0x41c3, 0x41c2, 0x41c2, 0x41c1, 0x41c1, 0x41c1, 0x41c0, 0x41c0, 0x41bf, + 0x41bf, 0x41bf, 0x41bd, 0x41bd, 0x41bc, 0x41bc, 0x41bc, 0x41bb, 0x41bb, 0x41ba, 0x41ba, 0x41b9, + 0x41b9, 0x41b9, 0x41b8, 0x41b8, 0x41b7, 0x41b7, 0x41b7, 0x41b6, 0x41b6, 0x41b5, 0x41b5, 0x41b5, + 0x41b4, 0x41b4, 0x41b3, 0x41b3, 0x41b2, 0x41b2, 0x41b2, 0x41b1, 0x41b1, 0x41b0, 0x41b0, 0x41b0, + 0x41af, 0x41af, 0x41ae, 0x41ae, 0x41ae, 0x41ad, 0x41ad, 0x41ac, 0x41ac, 0x41ac, 0x41ac, 0x41ac, + 0x41ab, 0x41ab, 0x41aa, 0x41aa, 0x41aa, 0x41a9, 0x41a9, 0x41a8, 0x41a8, 0x41a8, 0x41a7, 0x41a7, + 0x41a6, 0x41a6, 0x41a6, 0x41a5, 0x41a5, 0x41a5, 0x41a5, 0x41a5, 0x41a4, 0x41a4, 0x41a3, 0x41a3, + 0x41a2, 0x41a2, 0x41a2, 0x41a1, 0x41a1, 0x41a1, 0x41a1, 0x41a1, 0x41a0, 0x41a0, 0x419f, 0x419f, + 0x419f, 0x419e, 0x419e, 0x419e, 0x419e, 0x419e, 0x419d, 0x419d, 0x419c, 0x419c, 
0x419b, 0x419b, + 0x419b, 0x419b, 0x419b, 0x419a, 0x419a, 0x419a, 0x4199, 0x4199, 0x4198, 0x4198, 0x4198, 0x4198, + 0x4198, 0x4197, 0x4197, 0x4197, 0x4196, 0x4196, 0x4196, 0x4196, 0x4195, 0x4195, 0x4195, 0x4194, + 0x4194, 0x4194, 0x4194, 0x4194, 0x4193, 0x4193, 0x4192, 0x4192, 0x4192, 0x4192, 0x4192, 0x4191, + 0x4191, 0x4190, 0x4190, 0x4190, 0x4190, 0x4190, 0x418f, 0x418f, 0x418f, 0x418e, 0x418e, 0x418e, + 0x418e, 0x418e, 0x418d, 0x418d, 0x418d, 0x418d, 0x418d, 0x418c, 0x418c, 0x418b, 0x418b, 0x418b, + 0x418b, 0x418b, 0x418a, 0x418a, 0x418a, 0x418a, 0x418a, 0x4189, 0x4189, 0x4189, 0x4189, 0x4189, + 0x4188, 0x4188, 0x4187, 0x4187, 0x4187, 0x4187, 0x4187, 0x4186, 0x4186, 0x4186, 0x4186, 0x4186, + 0x4185, 0x4185, 0x4185, 0x4185, 0x4185, 0x4184, 0x4184, 0x4184, 0x4184, 0x4184, 0x4183, 0x4183, + 0x4183, 0x4183, 0x4183, 0x4182, 0x4182, 0x4182, 0x4182, 0x4181, 0x4181, 0x4181, 0x4181, 0x4181, + 0x4180, 0x4180, 0x4180, 0x4180, 0x417e, 0x417e, 0x417e, 0x417e, 0x417e, 0x417c, 0x417c, 0x417c, + 0x417c, 0x417c, 0x417a, 0x417a, 0x417a, 0x417a, 0x417a, 0x4178, 0x4178, 0x4178, 0x4178, 0x4176, + 0x4176, 0x4176, 0x4176, 0x4176, 0x4175, 0x4175, 0x4175, 0x4175, 0x4175, 0x4173, 0x4173, 0x4173, + 0x4173, 0x4173, 0x4171, 0x4171, 0x4171, 0x4171, 0x4171, 0x416f, 0x416f, 0x416f, 0x416f, 0x416f, + 0x416d, 0x416d, 0x416d, 0x416d, 0x416d, 0x416c, 0x416c, 0x416c, 0x416c, 0x416c, 0x416a, 0x416a, + 0x416a, 0x416a, 0x416a, 0x4168, 0x4168, 0x4168, 0x4168, 0x4167, 0x4167, 0x4167, 0x4167, 0x4167, + 0x4165, 0x4165, 0x4165, 0x4165, 0x4165, 0x4164, 0x4164, 0x4164, 0x4164, 0x4164, 0x4162, 0x4162, + 0x4162, 0x4162, 0x4162, 0x4160, 0x4160, 0x4160, 0x4160, 0x4160, 0x415f, 0x415f, 0x415f, 0x415f, + 0x415f, 0x415d, 0x415d, 0x415d, 0x415d, 0x415d, 0x415c, 0x415c, 0x415c, 0x415c, 0x415a, 0x415a, + 0x415a, 0x415a, 0x415a, 0x4159, 0x4159, 0x4159, 0x4159, 0x4159, 0x4158, 0x4158, 0x4158, 0x4158, + 0x4158, 0x4156, 0x4156, 0x4156, 0x4156, 0x4156, 0x4155, 0x4155, 0x4155, 0x4155, 0x4155, 0x4153, + 0x4153, 0x4153, 
0x4153, 0x4153, 0x4152, 0x4152, 0x4152, 0x4152, 0x4152, 0x4151, 0x4151, 0x4151, + 0x4151, 0x4151, 0x414f, 0x414f, 0x414f, 0x414f, 0x414e, 0x414e, 0x414e, 0x414e, 0x414e, 0x414d, + 0x414d, 0x414d, 0x414d, 0x414d, 0x414c, 0x414c, 0x414c, 0x414c, 0x414c, 0x414a, 0x414a, 0x414a, + 0x414a, 0x414a, 0x4149, 0x4149, 0x4149, 0x4149, 0x4149, 0x4148, 0x4148, 0x4148, 0x4148, 0x4148, + 0x4147, 0x4147, 0x4147, 0x4147, 0x4147, 0x4145, 0x4145, 0x4145, 0x4145, 0x4144, 0x4144, 0x4144, + 0x4144, 0x4144, 0x4143, 0x4143, 0x4143, 0x4143, 0x4143, 0x4142, 0x4142, 0x4142, 0x4142, 0x4142, + 0x4141, 0x4141, 0x4141, 0x4141, 0x4141, 0x4140, 0x4140, 0x4140, 0x4140, 0x4140, 0x413f, 0x413f, + 0x413f, 0x413f, 0x413f, 0x413d, 0x413d, 0x413d, 0x413d, 0x413d, 0x413c, 0x413c, 0x413c, 0x413c, + 0x413c, 0x413b, 0x413b, 0x413b, 0x413b, 0x413a, 0x413a, 0x413a, 0x413a, 0x413a, 0x4139, 0x4139, + 0x4139, 0x4139, 0x4139, 0x4138, 0x4138, 0x4138, 0x4138, 0x4138, 0x4137, 0x4137, 0x4137, 0x4137, + 0x4137, 0x4136, 0x4136, 0x4136, 0x4136, 0x4136, 0x4135, 0x4135, 0x4135, 0x4135, 0x4135, 0x4134, + 0x4134, 0x4134, 0x4134, 0x4134, 0x4133, 0x4133, 0x4133, 0x4133, 0x4132, 0x4132, 0x4132, 0x4132, + 0x4132, 0x4131, 0x4131, 0x4131, 0x4131, 0x4131, 0x4130, 0x4130, 0x4130, 0x4130, 0x4130, 0x412f, + 0x412f, 0x412f, 0x412f, 0x412f, 0x412e, 0x412e, 0x412e, 0x412e, 0x412e, 0x412d, 0x412d, 0x412d, + 0x412d, 0x412d, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, 0x412c, + 0x412b, 0x412b, 0x412b, 0x412b, 0x412a, 0x412a, 0x412a, 0x412a, 0x412a, 0x4129, 0x4129, 0x4129, + 0x4129, 0x4129, 0x4128, 0x4128, 0x4128, 0x4128, 0x4128, 0x4127, 0x4127, 0x4127, 0x4127, 0x4127, + 0x4126, 0x4126, 0x4126, 0x4126, 0x4126, 0x4125, 0x4125, 0x4125, 0x4125, 0x4125, 0x4125, 0x4125, + 0x4125, 0x4125, 0x4125, 0x4124, 0x4124, 0x4124, 0x4124, 0x4124, 0x4123, 0x4123, 0x4123, 0x4123, + 0x4122, 0x4122, 0x4122, 0x4122, 0x4122, 0x4121, 0x4121, 0x4121, 0x4121, 0x4121, 0x4121, 0x4121, + 0x4121, 0x4121, 0x4121, 0x4120, 0x411f, 0x411e, 0x411e, 
0x411d, 0x411c, 0x411b, 0x411b, 0x411a, + 0x4119, 0x4118, 0x4118, 0x4117, 0x4116, 0x4116, 0x4115, 0x4114, 0x4114, 0x4113, 0x4112, 0x4112, + 0x4111, 0x4110, 0x4110, 0x410f, +}; + +static void tl_lut_ref(uint16_t *ofmap, uint16_t *ifmap, cvk_tl_shape_t ifmap_shape) { + for (uint32_t i = 0; i < tl_shape_size(&ifmap_shape); i++) { + if (mode == PRE_DATA_COMPARE_FIX) { + ofmap[i] = test_pattern_ref[i]; + } else { + uint16_t v = convert_fp32_bf16(1 / (1.0 * (convert_bf16_fp32(ifmap[i])))); + ofmap[i] = v; + } + } +} + +static bool verify(uint16_t *ofmap_data, uint16_t *ref_data, uint16_t *ifmap, + uint64_t ifmap_shape_size, TEST_MODE mode) { + uint64_t size = ifmap_shape_size; + + for (uint64_t i = 0; i < size; i++) { + bool is_close; + uint16_t ref; + uint16_t ofmap_data_bf16; + float ref_f; + float ofmap_data_f; + + ref = ref_data[i]; + ref_f = convert_bf16_fp32(ref); + ofmap_data_f = convert_bf16_fp32(ofmap_data[i]); + ofmap_data_bf16 = ofmap_data[i]; + + if (mode == PRE_DATA_COMPARE_FIX) { + is_close = ofmap_data[i] == ref; + } else { + is_close = fabs(ref_f - ofmap_data_f) < 0.001; + } + + if (!is_close) { + fprintf(stderr, + "comparing failed at ofmap_data[%lu](input:%e), got %x, exp %x, " + "fp32: got %e exp %e\n", + i, convert_bf16_fp32(ifmap[i]), ofmap_data_bf16, ref, ofmap_data_f, ref_f); + exit(-1); + } + } + + return true; +} + +static void gen_input(uint16_t *ifmap, uint64_t ifmap_shape_size) { + if (mode == PRE_DATA_COMPARE_FIX) { + memcpy(ifmap, &test_pattern, sizeof(test_pattern)); + } else { + for (uint64_t i = 0; i < ifmap_shape_size; i++) { + srand(static_cast(time(0))); + std::random_device rd; + std::mt19937 e2(rd()); + float LO = pow(2, -10); + float HI = pow(2, 10); + // std::uniform_real_distribution<> dist(pow(2,-62), pow(2,63)); + for (uint64_t i = 0; i < ifmap_shape_size; i++) { + // float r3 = dist(e2); + float r3 = LO + static_cast(rand()) / (static_cast(RAND_MAX / (HI - LO))); + ifmap[i] = convert_fp32_bf16(r3); + } + } + } + +#ifdef DBG + 
for (uint64_t i = 0; i < ifmap_shape_size; i++) { + printf("source if[%lu] bf16 %f 0x%x, log2f is %f\n", i, convert_bf16_fp32(ifmap[i]), ifmap[i], + floor(log2((convert_bf16_fp32(ifmap[i]))))); + } +#endif /* ifdef DBG */ +} + +static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk, uint32_t input_n, uint32_t input_c, + uint32_t input_h, uint32_t input_w) { + cvk_fmt_t fmt = CVK_FMT_BF16; + + // TODO: check more shape / align + cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + cvk_tl_shape_t ofmap_shape = ifmap_shape; + cvk_tl_shape_t table_shape; + cvm_table_shape(bmk, &table_shape); + + uint64_t ifmap_shape_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + uint64_t table_size = tl_shape_size(&table_shape); + + // prepare input data with size + int data_type_size = bytesize_of_fmt(fmt); + uint64_t ifmap_bytesize = ifmap_shape_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + uint64_t table_bytesize = table_size * data_type_size; + + uint16_t *ifmap = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize); + uint16_t *table_data = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_mantissa = (uint16_t *)xmalloc(table_bytesize); + + // alloc lmem + cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf = test_alloc_tl(bmk, tl_ifmap->shape, fmt, /*align*/ 1); + cvk_tl_t *cvk_tl_table_answer = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *cvk_tl_table_answer_mantissa = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + + // generate testbench + gen_input(ifmap, ifmap_shape_size); + tl_lut_ref(ref_data, ifmap, ifmap_shape); + + // prepare table + cvm_reciprocal_tbl(table_data, table_data_mantissa, &table_shape); + + // sys->lmem + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t 
*)ifmap); + test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer, (uint8_t *)table_data); + test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer_mantissa, (uint8_t *)table_data_mantissa); + + cvm_emit_reciprocal(bmk, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_mantissa, + tl_ofmap_bf16); + + // issue cmd + test_submit_comp(ctx, bmk); + + // get output from lmem->sys + uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, tl_ofmap_bf16); + + verify(ofmap_data, ref_data, ifmap, ifmap_shape_size, mode); + + free_tl(bmk, cvk_tl_table_answer_mantissa); + free_tl(bmk, cvk_tl_table_answer); + free_tl(bmk, tl_buf); + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_ifmap); + + free(ifmap); + free(ref_data); + free(ofmap_data); + free(table_data); + free(table_data_mantissa); +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bmk; + int round_mode; + + round_mode = set_store_feround(); + + test_init(&ctx, &bmk); + + for (int i = GEN_POW_20_DATA_MAX_ERROR; i < TEST_MODE_MAX; i++) { + mode = static_cast(i); + printf("test mode %d...\n", mode); + + int input_n = 1; + int input_c = 32; + int input_h = 1; + int input_w = 1; + + if (mode == PRE_DATA_COMPARE_FIX) { + input_h = 4; + input_w = 8; + } else { + input_h = input_w = 16; + } + + testbench(&ctx, bmk, input_n, input_c, input_h, input_w); + } + + test_exit(&ctx, bmk); + restore_feround(round_mode); + return 0; +} diff --git a/cvimath/tests/cvi1835/sigmoid_linear_interp.cpp b/cvimath/tests/cvi1835/sigmoid_linear_interp.cpp new file mode 100644 index 000000000..1cb2fae38 --- /dev/null +++ b/cvimath/tests/cvi1835/sigmoid_linear_interp.cpp @@ -0,0 +1,907 @@ +//* TODO: you could rerange any value to -127~127 +#include +#include + +#define OUT +#define IN +//#define DBG + +/** + * pre_data means we test fixed pattern, it should be same sa lut + * compare fix means we MAKE SURE output values equal with golden, + * comment it for check with error using `MAX_ERROR` + */ +enum TEST_MODE { + 
PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare + PRE_DATA_MAX_ERROR, // pre-data + compare only diff < MAX_ERROR + GEN_DATA_MAX_ERROR, // gen data + compare only diff < MAX_ERROR + TEST_MODE_MAX, +}; + +static TEST_MODE mode; +#define MAX_ERROR (0.004) + +using namespace std; +static uint16_t test_pattern[] = { + 0x0000, 0x3C03, 0x3C83, 0x3CC5, 0x3D03, 0x3D24, 0x3D45, 0x3D65, 0x3D83, 0x3D93, 0x3DA4, 0x3DB4, + 0x3DC5, 0x3DD5, 0x3DE5, 0x3DF6, 0x3E03, 0x3E0B, 0x3E13, 0x3E1C, 0x3E24, 0x3E2C, 0x3E34, 0x3E3C, + 0x3E45, 0x3E4D, 0x3E55, 0x3E5D, 0x3E65, 0x3E6E, 0x3E76, 0x3E7E, 0x3E83, 0x3E87, 0x3E8B, 0x3E8F, + 0x3E93, 0x3E98, 0x3E9C, 0x3EA0, 0x3EA4, 0x3EA8, 0x3EAC, 0x3EB0, 0x3EB4, 0x3EB8, 0x3EBC, 0x3EC1, + 0x3EC5, 0x3EC9, 0x3ECD, 0x3ED1, 0x3ED5, 0x3ED9, 0x3EDD, 0x3EE1, 0x3EE5, 0x3EE9, 0x3EEE, 0x3EF2, + 0x3EF6, 0x3EFA, 0x3EFE, 0x3F01, 0x3F03, 0x3F05, 0x3F07, 0x3F09, 0x3F0B, 0x3F0D, 0x3F0F, 0x3F11, + 0x3F13, 0x3F16, 0x3F18, 0x3F1A, 0x3F1C, 0x3F1E, 0x3F20, 0x3F22, 0x3F24, 0x3F26, 0x3F28, 0x3F2A, + 0x3F2C, 0x3F2E, 0x3F30, 0x3F32, 0x3F34, 0x3F36, 0x3F38, 0x3F3A, 0x3F3C, 0x3F3E, 0x3F41, 0x3F43, + 0x3F45, 0x3F47, 0x3F49, 0x3F4B, 0x3F4D, 0x3F4F, 0x3F51, 0x3F53, 0x3F55, 0x3F57, 0x3F59, 0x3F5B, + 0x3F5D, 0x3F5F, 0x3F61, 0x3F63, 0x3F65, 0x3F67, 0x3F69, 0x3F6C, 0x3F6E, 0x3F70, 0x3F72, 0x3F74, + 0x3F76, 0x3F78, 0x3F7A, 0x3F7C, 0x3F7E, 0x3F80, 0x3F81, 0x3F82, 0x3F83, 0x3F84, 0x3F85, 0x3F86, + 0x3F87, 0x3F88, 0x3F89, 0x3F8A, 0x3F8B, 0x3F8C, 0x3F8D, 0x3F8E, 0x3F8F, 0x3F90, 0x3F91, 0x3F92, + 0x3F93, 0x3F94, 0x3F96, 0x3F97, 0x3F98, 0x3F99, 0x3F9A, 0x3F9B, 0x3F9C, 0x3F9D, 0x3F9E, 0x3F9F, + 0x3FA0, 0x3FA1, 0x3FA2, 0x3FA3, 0x3FA4, 0x3FA5, 0x3FA6, 0x3FA7, 0x3FA8, 0x3FA9, 0x3FAA, 0x3FAB, + 0x3FAC, 0x3FAD, 0x3FAE, 0x3FAF, 0x3FB0, 0x3FB1, 0x3FB2, 0x3FB3, 0x3FB4, 0x3FB5, 0x3FB6, 0x3FB7, + 0x3FB8, 0x3FB9, 0x3FBA, 0x3FBB, 0x3FBC, 0x3FBD, 0x3FBE, 0x3FBF, 0x3FC1, 0x3FC2, 0x3FC3, 0x3FC4, + 0x3FC5, 0x3FC6, 0x3FC7, 0x3FC8, 0x3FC9, 0x3FCA, 0x3FCB, 0x3FCC, 0x3FCD, 0x3FCE, 0x3FCF, 0x3FD0, + 0x3FD1, 
0x3FD2, 0x3FD3, 0x3FD4, 0x3FD5, 0x3FD6, 0x3FD7, 0x3FD8, 0x3FD9, 0x3FDA, 0x3FDB, 0x3FDC, + 0x3FDD, 0x3FDE, 0x3FDF, 0x3FE0, 0x3FE1, 0x3FE2, 0x3FE3, 0x3FE4, 0x3FE5, 0x3FE6, 0x3FE7, 0x3FE8, + 0x3FE9, 0x3FEA, 0x3FEC, 0x3FED, 0x3FEE, 0x3FEF, 0x3FF0, 0x3FF1, 0x3FF2, 0x3FF3, 0x3FF4, 0x3FF5, + 0x3FF6, 0x3FF7, 0x3FF8, 0x3FF9, 0x3FFA, 0x3FFB, 0x3FFC, 0x3FFD, 0x3FFE, 0x3FFF, 0x4000, 0x4001, + 0x4001, 0x4002, 0x4002, 0x4003, 0x4003, 0x4004, 0x4004, 0x4005, 0x4005, 0x4006, 0x4006, 0x4007, + 0x4007, 0x4008, 0x4008, 0x4009, 0x4009, 0x400A, 0x400A, 0x400B, 0x400B, 0x400C, 0x400C, 0x400D, + 0x400D, 0x400E, 0x400E, 0x400F, 0x400F, 0x4010, 0x4010, 0x4011, 0x4011, 0x4012, 0x4012, 0x4013, + 0x4013, 0x4014, 0x4014, 0x4015, 0x4016, 0x4016, 0x4017, 0x4017, 0x4018, 0x4018, 0x4019, 0x4019, + 0x401A, 0x401A, 0x401B, 0x401B, 0x401C, 0x401C, 0x401D, 0x401D, 0x401E, 0x401E, 0x401F, 0x401F, + 0x4020, 0x4020, 0x4021, 0x4021, 0x4022, 0x4022, 0x4023, 0x4023, 0x4024, 0x4024, 0x4025, 0x4025, + 0x4026, 0x4026, 0x4027, 0x4027, 0x4028, 0x4028, 0x4029, 0x4029, 0x402A, 0x402A, 0x402B, 0x402C, + 0x402C, 0x402D, 0x402D, 0x402E, 0x402E, 0x402F, 0x402F, 0x4030, 0x4030, 0x4031, 0x4031, 0x4032, + 0x4032, 0x4033, 0x4033, 0x4034, 0x4034, 0x4035, 0x4035, 0x4036, 0x4036, 0x4037, 0x4037, 0x4038, + 0x4038, 0x4039, 0x4039, 0x403A, 0x403A, 0x403B, 0x403B, 0x403C, 0x403C, 0x403D, 0x403D, 0x403E, + 0x403E, 0x403F, 0x403F, 0x4040, 0x4041, 0x4041, 0x4042, 0x4042, 0x4043, 0x4043, 0x4044, 0x4044, + 0x4045, 0x4045, 0x4046, 0x4046, 0x4047, 0x4047, 0x4048, 0x4048, 0x4049, 0x4049, 0x404A, 0x404A, + 0x404B, 0x404B, 0x404C, 0x404C, 0x404D, 0x404D, 0x404E, 0x404E, 0x404F, 0x404F, 0x4050, 0x4050, + 0x4051, 0x4051, 0x4052, 0x4052, 0x4053, 0x4053, 0x4054, 0x4054, 0x4055, 0x4056, 0x4056, 0x4057, + 0x4057, 0x4058, 0x4058, 0x4059, 0x4059, 0x405A, 0x405A, 0x405B, 0x405B, 0x405C, 0x405C, 0x405D, + 0x405D, 0x405E, 0x405E, 0x405F, 0x405F, 0x4060, 0x4060, 0x4061, 0x4061, 0x4062, 0x4062, 0x4063, + 0x4063, 0x4064, 0x4064, 0x4065, 0x4065, 0x4066, 
0x4066, 0x4067, 0x4067, 0x4068, 0x4068, 0x4069, + 0x4069, 0x406A, 0x406A, 0x406B, 0x406C, 0x406C, 0x406D, 0x406D, 0x406E, 0x406E, 0x406F, 0x406F, + 0x4070, 0x4070, 0x4071, 0x4071, 0x4072, 0x4072, 0x4073, 0x4073, 0x4074, 0x4074, 0x4075, 0x4075, + 0x4076, 0x4076, 0x4077, 0x4077, 0x4078, 0x4078, 0x4079, 0x4079, 0x407A, 0x407A, 0x407B, 0x407B, + 0x407C, 0x407C, 0x407D, 0x407D, 0x407E, 0x407E, 0x407F, 0x407F, 0x4080, 0x4080, 0x4081, 0x4081, + 0x4081, 0x4081, 0x4082, 0x4082, 0x4082, 0x4082, 0x4083, 0x4083, 0x4083, 0x4083, 0x4084, 0x4084, + 0x4084, 0x4084, 0x4085, 0x4085, 0x4085, 0x4085, 0x4086, 0x4086, 0x4086, 0x4086, 0x4087, 0x4087, + 0x4087, 0x4087, 0x4088, 0x4088, 0x4088, 0x4088, 0x4089, 0x4089, 0x4089, 0x4089, 0x408A, 0x408A, + 0x408A, 0x408A, 0x408B, 0x408B, 0x408B, 0x408C, 0x408C, 0x408C, 0x408C, 0x408D, 0x408D, 0x408D, + 0x408D, 0x408E, 0x408E, 0x408E, 0x408E, 0x408F, 0x408F, 0x408F, 0x408F, 0x4090, 0x4090, 0x4090, + 0x4090, 0x4091, 0x4091, 0x4091, 0x4091, 0x4092, 0x4092, 0x4092, 0x4092, 0x4093, 0x4093, 0x4093, + 0x4093, 0x4094, 0x4094, 0x4094, 0x4094, 0x4095, 0x4095, 0x4095, 0x4096, 0x4096, 0x4096, 0x4096, + 0x4097, 0x4097, 0x4097, 0x4097, 0x4098, 0x4098, 0x4098, 0x4098, 0x4099, 0x4099, 0x4099, 0x4099, + 0x409A, 0x409A, 0x409A, 0x409A, 0x409B, 0x409B, 0x409B, 0x409B, 0x409C, 0x409C, 0x409C, 0x409C, + 0x409D, 0x409D, 0x409D, 0x409D, 0x409E, 0x409E, 0x409E, 0x409E, 0x409F, 0x409F, 0x409F, 0x409F, + 0x40A0, 0x40A0, 0x40A0, 0x40A1, 0x40A1, 0x40A1, 0x40A1, 0x40A2, 0x40A2, 0x40A2, 0x40A2, 0x40A3, + 0x40A3, 0x40A3, 0x40A3, 0x40A4, 0x40A4, 0x40A4, 0x40A4, 0x40A5, 0x40A5, 0x40A5, 0x40A5, 0x40A6, + 0x40A6, 0x40A6, 0x40A6, 0x40A7, 0x40A7, 0x40A7, 0x40A7, 0x40A8, 0x40A8, 0x40A8, 0x40A8, 0x40A9, + 0x40A9, 0x40A9, 0x40A9, 0x40AA, 0x40AA, 0x40AA, 0x40AA, 0x40AB, 0x40AB, 0x40AB, 0x40AC, 0x40AC, + 0x40AC, 0x40AC, 0x40AD, 0x40AD, 0x40AD, 0x40AD, 0x40AE, 0x40AE, 0x40AE, 0x40AE, 0x40AF, 0x40AF, + 0x40AF, 0x40AF, 0x40B0, 0x40B0, 0x40B0, 0x40B0, 0x40B1, 0x40B1, 0x40B1, 0x40B1, 0x40B2, 
0x40B2, + 0x40B2, 0x40B2, 0x40B3, 0x40B3, 0x40B3, 0x40B3, 0x40B4, 0x40B4, 0x40B4, 0x40B4, 0x40B5, 0x40B5, + 0x40B5, 0x40B6, 0x40B6, 0x40B6, 0x40B6, 0x40B7, 0x40B7, 0x40B7, 0x40B7, 0x40B8, 0x40B8, 0x40B8, + 0x40B8, 0x40B9, 0x40B9, 0x40B9, 0x40B9, 0x40BA, 0x40BA, 0x40BA, 0x40BA, 0x40BB, 0x40BB, 0x40BB, + 0x40BB, 0x40BC, 0x40BC, 0x40BC, 0x40BC, 0x40BD, 0x40BD, 0x40BD, 0x40BD, 0x40BE, 0x40BE, 0x40BE, + 0x40BE, 0x40BF, 0x40BF, 0x40BF, 0x40BF, 0x40C0, 0x40C0, 0x40C0, 0x40C1, 0x40C1, 0x40C1, 0x40C1, + 0x40C2, 0x40C2, 0x40C2, 0x40C2, 0x40C3, 0x40C3, 0x40C3, 0x40C3, 0x40C4, 0x40C4, 0x40C4, 0x40C4, + 0x40C5, 0x40C5, 0x40C5, 0x40C5, 0x40C6, 0x40C6, 0x40C6, 0x40C6, 0x40C7, 0x40C7, 0x40C7, 0x40C7, + 0x40C8, 0x40C8, 0x40C8, 0x40C8, 0x40C9, 0x40C9, 0x40C9, 0x40C9, 0x40CA, 0x40CA, 0x40CA, 0x40CA, + 0x40CB, 0x40CB, 0x40CB, 0x40CC, 0x40CC, 0x40CC, 0x40CC, 0x40CD, 0x40CD, 0x40CD, 0x40CD, 0x40CE, + 0x40CE, 0x40CE, 0x40CE, 0x40CF, 0x40CF, 0x40CF, 0x40CF, 0x40D0, 0x40D0, 0x40D0, 0x40D0, 0x40D1, + 0x40D1, 0x40D1, 0x40D1, 0x40D2, 0x40D2, 0x40D2, 0x40D2, 0x40D3, 0x40D3, 0x40D3, 0x40D3, 0x40D4, + 0x40D4, 0x40D4, 0x40D4, 0x40D5, 0x40D5, 0x40D5, 0x40D6, 0x40D6, 0x40D6, 0x40D6, 0x40D7, 0x40D7, + 0x40D7, 0x40D7, 0x40D8, 0x40D8, 0x40D8, 0x40D8, 0x40D9, 0x40D9, 0x40D9, 0x40D9, 0x40DA, 0x40DA, + 0x40DA, 0x40DA, 0x40DB, 0x40DB, 0x40DB, 0x40DB, 0x40DC, 0x40DC, 0x40DC, 0x40DC, 0x40DD, 0x40DD, + 0x40DD, 0x40DD, 0x40DE, 0x40DE, 0x40DE, 0x40DE, 0x40DF, 0x40DF, 0x40DF, 0x40DF, 0x40E0, 0x40E0, + 0x40E0, 0x40E1, 0x40E1, 0x40E1, 0x40E1, 0x40E2, 0x40E2, 0x40E2, 0x40E2, 0x40E3, 0x40E3, 0x40E3, + 0x40E3, 0x40E4, 0x40E4, 0x40E4, 0x40E4, 0x40E5, 0x40E5, 0x40E5, 0x40E5, 0x40E6, 0x40E6, 0x40E6, + 0x40E6, 0x40E7, 0x40E7, 0x40E7, 0x40E7, 0x40E8, 0x40E8, 0x40E8, 0x40E8, 0x40E9, 0x40E9, 0x40E9, + 0x40E9, 0x40EA, 0x40EA, 0x40EA, 0x40EA, 0x40EB, 0x40EB, 0x40EB, 0x40EC, 0x40EC, 0x40EC, 0x40EC, + 0x40ED, 0x40ED, 0x40ED, 0x40ED, 0x40EE, 0x40EE, 0x40EE, 0x40EE, 0x40EF, 0x40EF, 0x40EF, 0x40EF, + 0x40F0, 0x40F0, 0x40F0, 
0x40F0, 0x40F1, 0x40F1, 0x40F1, 0x40F1, 0x40F2, 0x40F2, 0x40F2, 0x40F2, + 0x40F3, 0x40F3, 0x40F3, 0x40F3, 0x40F4, 0x40F4, 0x40F4, 0x40F4, 0x40F5, 0x40F5, 0x40F5, 0x40F6, + 0x40F6, 0x40F6, 0x40F6, 0x40F7, 0x40F7, 0x40F7, 0x40F7, 0x40F8, 0x40F8, 0x40F8, 0x40F8, 0x40F9, + 0x40F9, 0x40F9, 0x40F9, 0x40FA, 0x40FA, 0x40FA, 0x40FA, 0x40FB, 0x40FB, 0x40FB, 0x40FB, 0x40FC, + 0x40FC, 0x40FC, 0x40FC, 0x40FD, 0x40FD, 0x40FD, 0x40FD, 0x40FE, 0x40FE, 0x40FE, 0x40FE, 0x40FF, + 0x40FF, 0x40FF, 0x40FF, 0x4100, 0xBC03, 0xBC83, 0xBCC5, 0xBD03, 0xBD24, 0xBD45, 0xBD65, 0xBD83, + 0xBD93, 0xBDA4, 0xBDB4, 0xBDC5, 0xBDD5, 0xBDE5, 0xBDF6, 0xBE03, 0xBE0B, 0xBE13, 0xBE1C, 0xBE24, + 0xBE2C, 0xBE34, 0xBE3C, 0xBE45, 0xBE4D, 0xBE55, 0xBE5D, 0xBE65, 0xBE6E, 0xBE76, 0xBE7E, 0xBE83, + 0xBE87, 0xBE8B, 0xBE8F, 0xBE93, 0xBE98, 0xBE9C, 0xBEA0, 0xBEA4, 0xBEA8, 0xBEAC, 0xBEB0, 0xBEB4, + 0xBEB8, 0xBEBC, 0xBEC1, 0xBEC5, 0xBEC9, 0xBECD, 0xBED1, 0xBED5, 0xBED9, 0xBEDD, 0xBEE1, 0xBEE5, + 0xBEE9, 0xBEEE, 0xBEF2, 0xBEF6, 0xBEFA, 0xBEFE, 0xBF01, 0xBF03, 0xBF05, 0xBF07, 0xBF09, 0xBF0B, + 0xBF0D, 0xBF0F, 0xBF11, 0xBF13, 0xBF16, 0xBF18, 0xBF1A, 0xBF1C, 0xBF1E, 0xBF20, 0xBF22, 0xBF24, + 0xBF26, 0xBF28, 0xBF2A, 0xBF2C, 0xBF2E, 0xBF30, 0xBF32, 0xBF34, 0xBF36, 0xBF38, 0xBF3A, 0xBF3C, + 0xBF3E, 0xBF41, 0xBF43, 0xBF45, 0xBF47, 0xBF49, 0xBF4B, 0xBF4D, 0xBF4F, 0xBF51, 0xBF53, 0xBF55, + 0xBF57, 0xBF59, 0xBF5B, 0xBF5D, 0xBF5F, 0xBF61, 0xBF63, 0xBF65, 0xBF67, 0xBF69, 0xBF6C, 0xBF6E, + 0xBF70, 0xBF72, 0xBF74, 0xBF76, 0xBF78, 0xBF7A, 0xBF7C, 0xBF7E, 0xBF80, 0xBF81, 0xBF82, 0xBF83, + 0xBF84, 0xBF85, 0xBF86, 0xBF87, 0xBF88, 0xBF89, 0xBF8A, 0xBF8B, 0xBF8C, 0xBF8D, 0xBF8E, 0xBF8F, + 0xBF90, 0xBF91, 0xBF92, 0xBF93, 0xBF94, 0xBF96, 0xBF97, 0xBF98, 0xBF99, 0xBF9A, 0xBF9B, 0xBF9C, + 0xBF9D, 0xBF9E, 0xBF9F, 0xBFA0, 0xBFA1, 0xBFA2, 0xBFA3, 0xBFA4, 0xBFA5, 0xBFA6, 0xBFA7, 0xBFA8, + 0xBFA9, 0xBFAA, 0xBFAB, 0xBFAC, 0xBFAD, 0xBFAE, 0xBFAF, 0xBFB0, 0xBFB1, 0xBFB2, 0xBFB3, 0xBFB4, + 0xBFB5, 0xBFB6, 0xBFB7, 0xBFB8, 0xBFB9, 0xBFBA, 0xBFBB, 0xBFBC, 
0xBFBD, 0xBFBE, 0xBFBF, 0xBFC1, + 0xBFC2, 0xBFC3, 0xBFC4, 0xBFC5, 0xBFC6, 0xBFC7, 0xBFC8, 0xBFC9, 0xBFCA, 0xBFCB, 0xBFCC, 0xBFCD, + 0xBFCE, 0xBFCF, 0xBFD0, 0xBFD1, 0xBFD2, 0xBFD3, 0xBFD4, 0xBFD5, 0xBFD6, 0xBFD7, 0xBFD8, 0xBFD9, + 0xBFDA, 0xBFDB, 0xBFDC, 0xBFDD, 0xBFDE, 0xBFDF, 0xBFE0, 0xBFE1, 0xBFE2, 0xBFE3, 0xBFE4, 0xBFE5, + 0xBFE6, 0xBFE7, 0xBFE8, 0xBFE9, 0xBFEA, 0xBFEC, 0xBFED, 0xBFEE, 0xBFEF, 0xBFF0, 0xBFF1, 0xBFF2, + 0xBFF3, 0xBFF4, 0xBFF5, 0xBFF6, 0xBFF7, 0xBFF8, 0xBFF9, 0xBFFA, 0xBFFB, 0xBFFC, 0xBFFD, 0xBFFE, + 0xBFFF, 0xC000, 0xC001, 0xC001, 0xC002, 0xC002, 0xC003, 0xC003, 0xC004, 0xC004, 0xC005, 0xC005, + 0xC006, 0xC006, 0xC007, 0xC007, 0xC008, 0xC008, 0xC009, 0xC009, 0xC00A, 0xC00A, 0xC00B, 0xC00B, + 0xC00C, 0xC00C, 0xC00D, 0xC00D, 0xC00E, 0xC00E, 0xC00F, 0xC00F, 0xC010, 0xC010, 0xC011, 0xC011, + 0xC012, 0xC012, 0xC013, 0xC013, 0xC014, 0xC014, 0xC015, 0xC016, 0xC016, 0xC017, 0xC017, 0xC018, + 0xC018, 0xC019, 0xC019, 0xC01A, 0xC01A, 0xC01B, 0xC01B, 0xC01C, 0xC01C, 0xC01D, 0xC01D, 0xC01E, + 0xC01E, 0xC01F, 0xC01F, 0xC020, 0xC020, 0xC021, 0xC021, 0xC022, 0xC022, 0xC023, 0xC023, 0xC024, + 0xC024, 0xC025, 0xC025, 0xC026, 0xC026, 0xC027, 0xC027, 0xC028, 0xC028, 0xC029, 0xC029, 0xC02A, + 0xC02A, 0xC02B, 0xC02C, 0xC02C, 0xC02D, 0xC02D, 0xC02E, 0xC02E, 0xC02F, 0xC02F, 0xC030, 0xC030, + 0xC031, 0xC031, 0xC032, 0xC032, 0xC033, 0xC033, 0xC034, 0xC034, 0xC035, 0xC035, 0xC036, 0xC036, + 0xC037, 0xC037, 0xC038, 0xC038, 0xC039, 0xC039, 0xC03A, 0xC03A, 0xC03B, 0xC03B, 0xC03C, 0xC03C, + 0xC03D, 0xC03D, 0xC03E, 0xC03E, 0xC03F, 0xC03F, 0xC040, 0xC041, 0xC041, 0xC042, 0xC042, 0xC043, + 0xC043, 0xC044, 0xC044, 0xC045, 0xC045, 0xC046, 0xC046, 0xC047, 0xC047, 0xC048, 0xC048, 0xC049, + 0xC049, 0xC04A, 0xC04A, 0xC04B, 0xC04B, 0xC04C, 0xC04C, 0xC04D, 0xC04D, 0xC04E, 0xC04E, 0xC04F, + 0xC04F, 0xC050, 0xC050, 0xC051, 0xC051, 0xC052, 0xC052, 0xC053, 0xC053, 0xC054, 0xC054, 0xC055, + 0xC056, 0xC056, 0xC057, 0xC057, 0xC058, 0xC058, 0xC059, 0xC059, 0xC05A, 0xC05A, 0xC05B, 0xC05B, + 
0xC05C, 0xC05C, 0xC05D, 0xC05D, 0xC05E, 0xC05E, 0xC05F, 0xC05F, 0xC060, 0xC060, 0xC061, 0xC061, + 0xC062, 0xC062, 0xC063, 0xC063, 0xC064, 0xC064, 0xC065, 0xC065, 0xC066, 0xC066, 0xC067, 0xC067, + 0xC068, 0xC068, 0xC069, 0xC069, 0xC06A, 0xC06A, 0xC06B, 0xC06C, 0xC06C, 0xC06D, 0xC06D, 0xC06E, + 0xC06E, 0xC06F, 0xC06F, 0xC070, 0xC070, 0xC071, 0xC071, 0xC072, 0xC072, 0xC073, 0xC073, 0xC074, + 0xC074, 0xC075, 0xC075, 0xC076, 0xC076, 0xC077, 0xC077, 0xC078, 0xC078, 0xC079, 0xC079, 0xC07A, + 0xC07A, 0xC07B, 0xC07B, 0xC07C, 0xC07C, 0xC07D, 0xC07D, 0xC07E, 0xC07E, 0xC07F, 0xC07F, 0xC080, + 0xC080, 0xC081, 0xC081, 0xC081, 0xC081, 0xC082, 0xC082, 0xC082, 0xC082, 0xC083, 0xC083, 0xC083, + 0xC083, 0xC084, 0xC084, 0xC084, 0xC084, 0xC085, 0xC085, 0xC085, 0xC085, 0xC086, 0xC086, 0xC086, + 0xC086, 0xC087, 0xC087, 0xC087, 0xC087, 0xC088, 0xC088, 0xC088, 0xC088, 0xC089, 0xC089, 0xC089, + 0xC089, 0xC08A, 0xC08A, 0xC08A, 0xC08A, 0xC08B, 0xC08B, 0xC08B, 0xC08C, 0xC08C, 0xC08C, 0xC08C, + 0xC08D, 0xC08D, 0xC08D, 0xC08D, 0xC08E, 0xC08E, 0xC08E, 0xC08E, 0xC08F, 0xC08F, 0xC08F, 0xC08F, + 0xC090, 0xC090, 0xC090, 0xC090, 0xC091, 0xC091, 0xC091, 0xC091, 0xC092, 0xC092, 0xC092, 0xC092, + 0xC093, 0xC093, 0xC093, 0xC093, 0xC094, 0xC094, 0xC094, 0xC094, 0xC095, 0xC095, 0xC095, 0xC096, + 0xC096, 0xC096, 0xC096, 0xC097, 0xC097, 0xC097, 0xC097, 0xC098, 0xC098, 0xC098, 0xC098, 0xC099, + 0xC099, 0xC099, 0xC099, 0xC09A, 0xC09A, 0xC09A, 0xC09A, 0xC09B, 0xC09B, 0xC09B, 0xC09B, 0xC09C, + 0xC09C, 0xC09C, 0xC09C, 0xC09D, 0xC09D, 0xC09D, 0xC09D, 0xC09E, 0xC09E, 0xC09E, 0xC09E, 0xC09F, + 0xC09F, 0xC09F, 0xC09F, 0xC0A0, 0xC0A0, 0xC0A0, 0xC0A1, 0xC0A1, 0xC0A1, 0xC0A1, 0xC0A2, 0xC0A2, + 0xC0A2, 0xC0A2, 0xC0A3, 0xC0A3, 0xC0A3, 0xC0A3, 0xC0A4, 0xC0A4, 0xC0A4, 0xC0A4, 0xC0A5, 0xC0A5, + 0xC0A5, 0xC0A5, 0xC0A6, 0xC0A6, 0xC0A6, 0xC0A6, 0xC0A7, 0xC0A7, 0xC0A7, 0xC0A7, 0xC0A8, 0xC0A8, + 0xC0A8, 0xC0A8, 0xC0A9, 0xC0A9, 0xC0A9, 0xC0A9, 0xC0AA, 0xC0AA, 0xC0AA, 0xC0AA, 0xC0AB, 0xC0AB, + 0xC0AB, 0xC0AC, 0xC0AC, 0xC0AC, 0xC0AC, 
0xC0AD, 0xC0AD, 0xC0AD, 0xC0AD, 0xC0AE, 0xC0AE, 0xC0AE, + 0xC0AE, 0xC0AF, 0xC0AF, 0xC0AF, 0xC0AF, 0xC0B0, 0xC0B0, 0xC0B0, 0xC0B0, 0xC0B1, 0xC0B1, 0xC0B1, + 0xC0B1, 0xC0B2, 0xC0B2, 0xC0B2, 0xC0B2, 0xC0B3, 0xC0B3, 0xC0B3, 0xC0B3, 0xC0B4, 0xC0B4, 0xC0B4, + 0xC0B4, 0xC0B5, 0xC0B5, 0xC0B5, 0xC0B6, 0xC0B6, 0xC0B6, 0xC0B6, 0xC0B7, 0xC0B7, 0xC0B7, 0xC0B7, + 0xC0B8, 0xC0B8, 0xC0B8, 0xC0B8, 0xC0B9, 0xC0B9, 0xC0B9, 0xC0B9, 0xC0BA, 0xC0BA, 0xC0BA, 0xC0BA, + 0xC0BB, 0xC0BB, 0xC0BB, 0xC0BB, 0xC0BC, 0xC0BC, 0xC0BC, 0xC0BC, 0xC0BD, 0xC0BD, 0xC0BD, 0xC0BD, + 0xC0BE, 0xC0BE, 0xC0BE, 0xC0BE, 0xC0BF, 0xC0BF, 0xC0BF, 0xC0BF, 0xC0C0, 0xC0C0, 0xC0C0, 0xC0C1, + 0xC0C1, 0xC0C1, 0xC0C1, 0xC0C2, 0xC0C2, 0xC0C2, 0xC0C2, 0xC0C3, 0xC0C3, 0xC0C3, 0xC0C3, 0xC0C4, + 0xC0C4, 0xC0C4, 0xC0C4, 0xC0C5, 0xC0C5, 0xC0C5, 0xC0C5, 0xC0C6, 0xC0C6, 0xC0C6, 0xC0C6, 0xC0C7, + 0xC0C7, 0xC0C7, 0xC0C7, 0xC0C8, 0xC0C8, 0xC0C8, 0xC0C8, 0xC0C9, 0xC0C9, 0xC0C9, 0xC0C9, 0xC0CA, + 0xC0CA, 0xC0CA, 0xC0CA, 0xC0CB, 0xC0CB, 0xC0CB, 0xC0CC, 0xC0CC, 0xC0CC, 0xC0CC, 0xC0CD, 0xC0CD, + 0xC0CD, 0xC0CD, 0xC0CE, 0xC0CE, 0xC0CE, 0xC0CE, 0xC0CF, 0xC0CF, 0xC0CF, 0xC0CF, 0xC0D0, 0xC0D0, + 0xC0D0, 0xC0D0, 0xC0D1, 0xC0D1, 0xC0D1, 0xC0D1, 0xC0D2, 0xC0D2, 0xC0D2, 0xC0D2, 0xC0D3, 0xC0D3, + 0xC0D3, 0xC0D3, 0xC0D4, 0xC0D4, 0xC0D4, 0xC0D4, 0xC0D5, 0xC0D5, 0xC0D5, 0xC0D6, 0xC0D6, 0xC0D6, + 0xC0D6, 0xC0D7, 0xC0D7, 0xC0D7, 0xC0D7, 0xC0D8, 0xC0D8, 0xC0D8, 0xC0D8, 0xC0D9, 0xC0D9, 0xC0D9, + 0xC0D9, 0xC0DA, 0xC0DA, 0xC0DA, 0xC0DA, 0xC0DB, 0xC0DB, 0xC0DB, 0xC0DB, 0xC0DC, 0xC0DC, 0xC0DC, + 0xC0DC, 0xC0DD, 0xC0DD, 0xC0DD, 0xC0DD, 0xC0DE, 0xC0DE, 0xC0DE, 0xC0DE, 0xC0DF, 0xC0DF, 0xC0DF, + 0xC0DF, 0xC0E0, 0xC0E0, 0xC0E0, 0xC0E1, 0xC0E1, 0xC0E1, 0xC0E1, 0xC0E2, 0xC0E2, 0xC0E2, 0xC0E2, + 0xC0E3, 0xC0E3, 0xC0E3, 0xC0E3, 0xC0E4, 0xC0E4, 0xC0E4, 0xC0E4, 0xC0E5, 0xC0E5, 0xC0E5, 0xC0E5, + 0xC0E6, 0xC0E6, 0xC0E6, 0xC0E6, 0xC0E7, 0xC0E7, 0xC0E7, 0xC0E7, 0xC0E8, 0xC0E8, 0xC0E8, 0xC0E8, + 0xC0E9, 0xC0E9, 0xC0E9, 0xC0E9, 0xC0EA, 0xC0EA, 0xC0EA, 0xC0EA, 0xC0EB, 0xC0EB, 
0xC0EB, 0xC0EC, + 0xC0EC, 0xC0EC, 0xC0EC, 0xC0ED, 0xC0ED, 0xC0ED, 0xC0ED, 0xC0EE, 0xC0EE, 0xC0EE, 0xC0EE, 0xC0EF, + 0xC0EF, 0xC0EF, 0xC0EF, 0xC0F0, 0xC0F0, 0xC0F0, 0xC0F0, 0xC0F1, 0xC0F1, 0xC0F1, 0xC0F1, 0xC0F2, + 0xC0F2, 0xC0F2, 0xC0F2, 0xC0F3, 0xC0F3, 0xC0F3, 0xC0F3, 0xC0F4, 0xC0F4, 0xC0F4, 0xC0F4, 0xC0F5, + 0xC0F5, 0xC0F5, 0xC0F6, 0xC0F6, 0xC0F6, 0xC0F6, 0xC0F7, 0xC0F7, 0xC0F7, 0xC0F7, 0xC0F8, 0xC0F8, + 0xC0F8, 0xC0F8, 0xC0F9, 0xC0F9, 0xC0F9, 0xC0F9, 0xC0FA, 0xC0FA, 0xC0FA, 0xC0FA, 0xC0FB, 0xC0FB, + 0xC0FB, 0xC0FB, 0xC0FC, 0xC0FC, 0xC0FC, 0xC0FC, 0xC0FD, 0xC0FD, 0xC0FD, 0xC0FD, 0xC0FE, 0xC0FE, + 0xC0FE, 0xC0FE, 0xC0FF, 0xC0FF, 0xC0FF, 0xC0FF, 0xC100, 0xC100, +}; + +static uint16_t sigmode_golden_bf16[] = { + 0x3f00, 0x3f01, 0x3f01, 0x3f02, 0x3f02, 0x3f03, 0x3f03, 0x3f04, 0x3f04, 0x3f05, 0x3f05, 0x3f06, + 0x3f06, 0x3f07, 0x3f07, 0x3f08, 0x3f08, 0x3f09, 0x3f09, 0x3f0a, 0x3f0a, 0x3f0b, 0x3f0b, 0x3f0c, + 0x3f0c, 0x3f0d, 0x3f0d, 0x3f0e, 0x3f0e, 0x3f0f, 0x3f0f, 0x3f10, 0x3f10, 0x3f11, 0x3f11, 0x3f12, + 0x3f12, 0x3f13, 0x3f13, 0x3f14, 0x3f14, 0x3f15, 0x3f15, 0x3f16, 0x3f16, 0x3f17, 0x3f17, 0x3f18, + 0x3f19, 0x3f19, 0x3f1a, 0x3f1a, 0x3f1b, 0x3f1b, 0x3f1b, 0x3f1c, 0x3f1d, 0x3f1d, 0x3f1e, 0x3f1e, + 0x3f1f, 0x3f1f, 0x3f20, 0x3f1f, 0x3f20, 0x3f20, 0x3f21, 0x3f21, 0x3f22, 0x3f22, 0x3f23, 0x3f23, + 0x3f24, 0x3f24, 0x3f25, 0x3f25, 0x3f26, 0x3f26, 0x3f27, 0x3f27, 0x3f28, 0x3f28, 0x3f29, 0x3f29, + 0x3f2a, 0x3f2a, 0x3f2a, 0x3f2a, 0x3f2b, 0x3f2b, 0x3f2c, 0x3f2c, 0x3f2d, 0x3f2d, 0x3f2e, 0x3f2f, + 0x3f2f, 0x3f30, 0x3f30, 0x3f30, 0x3f31, 0x3f31, 0x3f31, 0x3f32, 0x3f32, 0x3f32, 0x3f33, 0x3f33, + 0x3f34, 0x3f34, 0x3f35, 0x3f36, 0x3f36, 0x3f36, 0x3f37, 0x3f37, 0x3f38, 0x3f38, 0x3f38, 0x3f39, + 0x3f39, 0x3f3a, 0x3f3a, 0x3f3a, 0x3f3b, 0x3f3b, 0x3f3b, 0x3f3c, 0x3f3c, 0x3f3d, 0x3f3d, 0x3f3d, + 0x3f3e, 0x3f3e, 0x3f3e, 0x3f3f, 0x3f3f, 0x3f40, 0x3f40, 0x3f40, 0x3f41, 0x3f41, 0x3f41, 0x3f42, + 0x3f42, 0x3f42, 0x3f43, 0x3f44, 0x3f44, 0x3f44, 0x3f45, 0x3f45, 0x3f45, 0x3f46, 0x3f46, 0x3f46, + 
0x3f47, 0x3f47, 0x3f48, 0x3f48, 0x3f48, 0x3f49, 0x3f49, 0x3f49, 0x3f4a, 0x3f4a, 0x3f4b, 0x3f4b, + 0x3f4b, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4c, 0x3f4d, 0x3f4d, 0x3f4d, 0x3f4e, 0x3f4e, 0x3f4e, + 0x3f4f, 0x3f4f, 0x3f50, 0x3f50, 0x3f50, 0x3f51, 0x3f51, 0x3f51, 0x3f51, 0x3f52, 0x3f52, 0x3f52, + 0x3f52, 0x3f53, 0x3f53, 0x3f54, 0x3f54, 0x3f55, 0x3f55, 0x3f55, 0x3f55, 0x3f56, 0x3f56, 0x3f56, + 0x3f56, 0x3f57, 0x3f57, 0x3f57, 0x3f57, 0x3f58, 0x3f58, 0x3f58, 0x3f58, 0x3f59, 0x3f59, 0x3f59, + 0x3f59, 0x3f5a, 0x3f5a, 0x3f5a, 0x3f5a, 0x3f5a, 0x3f5b, 0x3f5b, 0x3f5b, 0x3f5b, 0x3f5c, 0x3f5c, + 0x3f5c, 0x3f5c, 0x3f5d, 0x3f5d, 0x3f5d, 0x3f5e, 0x3f5e, 0x3f5e, 0x3f5e, 0x3f5f, 0x3f5f, 0x3f5f, + 0x3f5f, 0x3f60, 0x3f60, 0x3f60, 0x3f60, 0x3f61, 0x3f61, 0x3f61, 0x3f61, 0x3f62, 0x3f61, 0x3f61, + 0x3f61, 0x3f62, 0x3f62, 0x3f62, 0x3f62, 0x3f63, 0x3f63, 0x3f63, 0x3f63, 0x3f64, 0x3f64, 0x3f64, + 0x3f64, 0x3f65, 0x3f65, 0x3f65, 0x3f65, 0x3f66, 0x3f66, 0x3f66, 0x3f66, 0x3f66, 0x3f66, 0x3f66, + 0x3f66, 0x3f67, 0x3f67, 0x3f67, 0x3f67, 0x3f68, 0x3f68, 0x3f68, 0x3f68, 0x3f69, 0x3f69, 0x3f69, + 0x3f69, 0x3f69, 0x3f69, 0x3f69, 0x3f6a, 0x3f6a, 0x3f6a, 0x3f6a, 0x3f6a, 0x3f6a, 0x3f6a, 0x3f6a, + 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6b, 0x3f6c, 0x3f6c, 0x3f6c, 0x3f6c, + 0x3f6d, 0x3f6d, 0x3f6d, 0x3f6d, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e, 0x3f6e, + 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f6f, 0x3f70, 0x3f70, + 0x3f70, 0x3f70, 0x3f70, 0x3f70, 0x3f70, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71, + 0x3f71, 0x3f72, 0x3f72, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f71, 0x3f72, 0x3f72, 0x3f72, + 0x3f72, 0x3f72, 0x3f72, 0x3f72, 0x3f72, 0x3f73, 0x3f73, 0x3f73, 0x3f73, 0x3f73, 0x3f73, 0x3f73, + 0x3f73, 0x3f74, 0x3f74, 0x3f74, 0x3f74, 0x3f74, 0x3f74, 0x3f74, 0x3f75, 0x3f75, 0x3f75, 0x3f75, + 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, 0x3f75, + 0x3f75, 0x3f75, 0x3f76, 0x3f76, 0x3f76, 
0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, + 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f76, 0x3f77, 0x3f77, 0x3f77, 0x3f77, 0x3f77, 0x3f77, + 0x3f77, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, + 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, 0x3f78, + 0x3f78, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f79, + 0x3f79, 0x3f79, 0x3f79, 0x3f79, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, + 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, + 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7a, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, + 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, 0x3f7b, + 0x3f7b, 0x3f7b, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, + 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, + 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7c, 0x3f7d, 0x3f7d, + 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, + 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, + 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, 0x3f7d, + 0x3f7d, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, + 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, + 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, + 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, + 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, + 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 0x3f7e, 
0x3f7e, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, 0x3f7f, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 
0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3f80, + 0x3f80, 0x3f80, 0x3f80, 0x3f80, 0x3eff, 0x3efe, 0x3efd, 0x3efc, 0x3efb, 0x3efa, 0x3ef9, 0x3ef8, + 0x3ef7, 0x3ef6, 0x3ef5, 0x3ef4, 0x3ef3, 0x3ef2, 0x3ef1, 0x3ef0, 0x3eef, 0x3eee, 0x3eed, 0x3eec, + 0x3eeb, 0x3eea, 0x3ee9, 0x3ee7, 0x3ee6, 0x3ee5, 0x3ee4, 0x3ee3, 0x3ee2, 0x3ee1, 0x3ee0, 0x3edf, + 0x3ede, 0x3edd, 0x3edc, 0x3edb, 0x3eda, 0x3ed9, 0x3ed8, 0x3ed7, 0x3ed6, 0x3ed5, 0x3ed4, 0x3ed3, + 0x3ed2, 0x3ed1, 0x3ed1, 0x3ed0, 0x3ecf, 0x3ece, 0x3ecd, 0x3ecc, 0x3ecb, 0x3eca, 0x3ec9, 0x3ec8, + 0x3ec7, 0x3ec6, 0x3ec5, 0x3ec4, 0x3ec3, 0x3ec2, 0x3ec1, 0x3ec0, 0x3ebf, 0x3ebe, 0x3ebd, 0x3ebc, + 0x3ebb, 0x3eba, 0x3eba, 0x3eb9, 0x3eb7, 0x3eb6, 0x3eb5, 0x3eb4, 0x3eb4, 0x3eb3, 0x3eb2, 0x3eb1, + 0x3eb0, 0x3eaf, 0x3eaf, 0x3eae, 0x3ead, 0x3eab, 0x3eaa, 0x3ea9, 0x3ea8, 0x3ea7, 0x3ea7, 0x3ea6, + 0x3ea5, 0x3ea4, 0x3ea3, 0x3ea2, 0x3ea1, 0x3ea0, 0x3e9f, 0x3e9e, 0x3e9e, 0x3e9d, 0x3e9c, 0x3e9b, + 0x3e9a, 0x3e99, 0x3e98, 0x3e98, 0x3e97, 0x3e97, 0x3e96, 0x3e95, 0x3e94, 0x3e93, 0x3e92, 0x3e91, + 0x3e90, 0x3e8f, 0x3e8e, 0x3e8e, 0x3e8d, 0x3e8c, 0x3e8b, 0x3e8a, 0x3e8a, 0x3e89, 0x3e88, 0x3e88, + 0x3e87, 0x3e86, 0x3e85, 0x3e85, 0x3e83, 0x3e82, 0x3e82, 
0x3e81, 0x3e80, 0x3e7e, 0x3e7d, 0x3e7c, + 0x3e7b, 0x3e7a, 0x3e78, 0x3e77, 0x3e75, 0x3e72, 0x3e71, 0x3e6f, 0x3e6e, 0x3e6c, 0x3e6b, 0x3e69, + 0x3e68, 0x3e67, 0x3e65, 0x3e64, 0x3e63, 0x3e61, 0x3e60, 0x3e5f, 0x3e5d, 0x3e5c, 0x3e5a, 0x3e59, + 0x3e58, 0x3e56, 0x3e55, 0x3e54, 0x3e52, 0x3e51, 0x3e50, 0x3e4f, 0x3e4e, 0x3e4c, 0x3e4b, 0x3e4a, + 0x3e49, 0x3e47, 0x3e46, 0x3e45, 0x3e44, 0x3e43, 0x3e41, 0x3e40, 0x3e3f, 0x3e3e, 0x3e3c, 0x3e3a, + 0x3e39, 0x3e37, 0x3e36, 0x3e35, 0x3e34, 0x3e33, 0x3e31, 0x3e30, 0x3e2f, 0x3e2e, 0x3e2c, 0x3e2b, + 0x3e2a, 0x3e29, 0x3e28, 0x3e27, 0x3e26, 0x3e25, 0x3e24, 0x3e23, 0x3e22, 0x3e20, 0x3e20, 0x3e1f, + 0x3e1e, 0x3e1d, 0x3e1c, 0x3e1b, 0x3e1a, 0x3e19, 0x3e18, 0x3e17, 0x3e16, 0x3e15, 0x3e14, 0x3e13, + 0x3e12, 0x3e11, 0x3e10, 0x3e0f, 0x3e0e, 0x3e0c, 0x3e0b, 0x3e0a, 0x3e09, 0x3e08, 0x3e07, 0x3e06, + 0x3e05, 0x3e04, 0x3e03, 0x3e03, 0x3e02, 0x3e01, 0x3e00, 0x3dff, 0x3dfd, 0x3dfb, 0x3df9, 0x3df8, + 0x3df6, 0x3df4, 0x3df1, 0x3df1, 0x3ded, 0x3ded, 0x3dea, 0x3dea, 0x3de7, 0x3de7, 0x3de4, 0x3de4, + 0x3de1, 0x3de1, 0x3dde, 0x3dde, 0x3ddb, 0x3ddb, 0x3dd8, 0x3dd8, 0x3dd5, 0x3dd5, 0x3dd2, 0x3dd2, + 0x3dcf, 0x3dcf, 0x3dcc, 0x3dcc, 0x3dc9, 0x3dc9, 0x3dc7, 0x3dc7, 0x3dc3, 0x3dc3, 0x3dc0, 0x3dc0, + 0x3dbe, 0x3dbe, 0x3dbb, 0x3dbb, 0x3db9, 0x3db9, 0x3db6, 0x3db4, 0x3db4, 0x3db1, 0x3db1, 0x3dae, + 0x3dae, 0x3dac, 0x3dac, 0x3da9, 0x3da9, 0x3da7, 0x3da7, 0x3da5, 0x3da5, 0x3da3, 0x3da3, 0x3da0, + 0x3da0, 0x3d9e, 0x3d9e, 0x3d9b, 0x3d9b, 0x3d99, 0x3d99, 0x3d97, 0x3d97, 0x3d94, 0x3d94, 0x3d93, + 0x3d93, 0x3d91, 0x3d91, 0x3d8f, 0x3d8f, 0x3d8d, 0x3d8d, 0x3d8a, 0x3d8a, 0x3d88, 0x3d88, 0x3d86, + 0x3d86, 0x3d84, 0x3d82, 0x3d82, 0x3d80, 0x3d80, 0x3d7d, 0x3d7d, 0x3d79, 0x3d79, 0x3d76, 0x3d76, + 0x3d72, 0x3d72, 0x3d6f, 0x3d6f, 0x3d6b, 0x3d6b, 0x3d68, 0x3d68, 0x3d65, 0x3d65, 0x3d61, 0x3d61, + 0x3d5e, 0x3d5e, 0x3d5b, 0x3d5b, 0x3d58, 0x3d58, 0x3d55, 0x3d55, 0x3d52, 0x3d52, 0x3d4e, 0x3d4e, + 0x3d4b, 0x3d4b, 0x3d48, 0x3d48, 0x3d45, 0x3d45, 0x3d42, 0x3d3f, 0x3d3f, 0x3d3c, 0x3d3c, 0x3d3a, 
+ 0x3d3a, 0x3d37, 0x3d37, 0x3d34, 0x3d34, 0x3d32, 0x3d32, 0x3d2f, 0x3d2f, 0x3d2c, 0x3d2c, 0x3d2a, + 0x3d2a, 0x3d27, 0x3d27, 0x3d24, 0x3d24, 0x3d22, 0x3d22, 0x3d20, 0x3d20, 0x3d1d, 0x3d1d, 0x3d1b, + 0x3d1b, 0x3d19, 0x3d19, 0x3d17, 0x3d17, 0x3d15, 0x3d15, 0x3d12, 0x3d12, 0x3d10, 0x3d10, 0x3d0e, + 0x3d0c, 0x3d0c, 0x3d0a, 0x3d0a, 0x3d08, 0x3d08, 0x3d06, 0x3d06, 0x3d04, 0x3d04, 0x3d02, 0x3d02, + 0x3cff, 0x3cff, 0x3cfb, 0x3cfb, 0x3cf8, 0x3cf8, 0x3cf4, 0x3cf4, 0x3cf0, 0x3cf0, 0x3cec, 0x3cec, + 0x3ce9, 0x3ce9, 0x3ce5, 0x3ce5, 0x3ce2, 0x3ce2, 0x3cdf, 0x3cdf, 0x3cdb, 0x3cdb, 0x3cd8, 0x3cd8, + 0x3cd5, 0x3cd5, 0x3cd2, 0x3cd2, 0x3ccf, 0x3ccf, 0x3ccc, 0x3cc8, 0x3cc8, 0x3cc5, 0x3cc5, 0x3cc2, + 0x3cc2, 0x3cbf, 0x3cbf, 0x3cbc, 0x3cbc, 0x3cb9, 0x3cb9, 0x3cb6, 0x3cb6, 0x3cb4, 0x3cb4, 0x3cb1, + 0x3cb1, 0x3cae, 0x3cae, 0x3cac, 0x3cac, 0x3ca9, 0x3ca9, 0x3ca7, 0x3ca7, 0x3ca5, 0x3ca5, 0x3ca2, + 0x3ca2, 0x3ca0, 0x3ca0, 0x3c9d, 0x3c9d, 0x3c9b, 0x3c9b, 0x3c98, 0x3c98, 0x3c96, 0x3c96, 0x3c93, + 0x3c93, 0x3c8f, 0x3c8f, 0x3c8f, 0x3c8f, 0x3c8b, 0x3c8b, 0x3c8b, 0x3c8b, 0x3c87, 0x3c87, 0x3c87, + 0x3c87, 0x3c82, 0x3c82, 0x3c82, 0x3c82, 0x3c7c, 0x3c7c, 0x3c7c, 0x3c7c, 0x3c75, 0x3c75, 0x3c75, + 0x3c75, 0x3c6e, 0x3c6e, 0x3c6e, 0x3c6e, 0x3c66, 0x3c66, 0x3c66, 0x3c66, 0x3c5f, 0x3c5f, 0x3c5f, + 0x3c5f, 0x3c59, 0x3c59, 0x3c59, 0x3c59, 0x3c53, 0x3c53, 0x3c53, 0x3c4c, 0x3c4c, 0x3c4c, 0x3c4c, + 0x3c46, 0x3c46, 0x3c46, 0x3c46, 0x3c3f, 0x3c3f, 0x3c3f, 0x3c3f, 0x3c39, 0x3c39, 0x3c39, 0x3c39, + 0x3c34, 0x3c34, 0x3c34, 0x3c34, 0x3c2f, 0x3c2f, 0x3c2f, 0x3c2f, 0x3c29, 0x3c29, 0x3c29, 0x3c29, + 0x3c24, 0x3c24, 0x3c24, 0x3c24, 0x3c1f, 0x3c1f, 0x3c1f, 0x3c1f, 0x3c1a, 0x3c1a, 0x3c1a, 0x3c16, + 0x3c16, 0x3c16, 0x3c16, 0x3c12, 0x3c12, 0x3c12, 0x3c12, 0x3c0d, 0x3c0d, 0x3c0d, 0x3c0d, 0x3c09, + 0x3c09, 0x3c09, 0x3c09, 0x3c04, 0x3c04, 0x3c04, 0x3c04, 0x3c00, 0x3c00, 0x3c00, 0x3c00, 0x3bf8, + 0x3bf8, 0x3bf8, 0x3bf8, 0x3bf1, 0x3bf1, 0x3bf1, 0x3bf1, 0x3be9, 0x3be9, 0x3be9, 0x3be9, 0x3be2, + 0x3be2, 0x3be2, 0x3be2, 0x3bdb, 
0x3bdb, 0x3bdb, 0x3bd4, 0x3bd4, 0x3bd4, 0x3bd4, 0x3bce, 0x3bce, + 0x3bce, 0x3bce, 0x3bc8, 0x3bc8, 0x3bc8, 0x3bc8, 0x3bc2, 0x3bc2, 0x3bc2, 0x3bc2, 0x3bbc, 0x3bbc, + 0x3bbc, 0x3bbc, 0x3bb6, 0x3bb6, 0x3bb6, 0x3bb6, 0x3bb0, 0x3bb0, 0x3bb0, 0x3bb0, 0x3bab, 0x3bab, + 0x3bab, 0x3bab, 0x3ba6, 0x3ba6, 0x3ba6, 0x3ba6, 0x3ba1, 0x3ba1, 0x3ba1, 0x3ba1, 0x3b9c, 0x3b9c, + 0x3b9c, 0x3b97, 0x3b97, 0x3b97, 0x3b97, 0x3b92, 0x3b92, 0x3b92, 0x3b92, 0x3b8e, 0x3b8e, 0x3b8e, + 0x3b8e, 0x3b8a, 0x3b8a, 0x3b8a, 0x3b8a, 0x3b85, 0x3b85, 0x3b85, 0x3b85, 0x3b81, 0x3b81, 0x3b81, + 0x3b81, 0x3b7b, 0x3b7b, 0x3b7b, 0x3b7b, 0x3b73, 0x3b73, 0x3b73, 0x3b73, 0x3b6c, 0x3b6c, 0x3b6c, + 0x3b6c, 0x3b65, 0x3b65, 0x3b65, 0x3b5d, 0x3b5d, 0x3b5d, 0x3b5d, 0x3b56, 0x3b56, 0x3b56, 0x3b56, + 0x3b50, 0x3b50, 0x3b50, 0x3b50, 0x3b4a, 0x3b4a, 0x3b4a, 0x3b4a, 0x3b43, 0x3b43, 0x3b43, 0x3b43, + 0x3b3d, 0x3b3d, 0x3b3d, 0x3b3d, 0x3b38, 0x3b38, 0x3b38, 0x3b38, 0x3b32, 0x3b32, 0x3b32, 0x3b32, + 0x3b2c, 0x3b2c, 0x3b2c, 0x3b2c, 0x3b27, 0x3b27, 0x3b27, 0x3b27, 0x3b22, 0x3b22, 0x3b22, 0x3b1d, + 0x3b1d, 0x3b1d, 0x3b1d, 0x3b18, 0x3b18, 0x3b18, 0x3b18, 0x3b13, 0x3b13, 0x3b13, 0x3b13, 0x3b0f, + 0x3b0f, 0x3b0f, 0x3b0f, 0x3b0b, 0x3b0b, 0x3b0b, 0x3b0b, 0x3b06, 0x3b06, 0x3b06, 0x3b06, 0x3b02, + 0x3b02, 0x3b02, 0x3b02, 0x3afd, 0x3afd, 0x3afd, 0x3afd, 0x3af5, 0x3af5, 0x3af5, 0x3af5, 0x3aed, + 0x3aed, 0x3aed, 0x3aed, 0x3ae6, 0x3ae6, 0x3ae6, 0x3adf, 0x3adf, 0x3adf, 0x3adf, 0x3ad8, 0x3ad8, + 0x3ad8, 0x3ad8, 0x3ad1, 0x3ad1, 0x3ad1, 0x3ad1, 0x3acb, 0x3acb, 0x3acb, 0x3acb, 0x3ac5, 0x3ac5, + 0x3ac5, 0x3ac5, 0x3abf, 0x3abf, 0x3abf, 0x3abf, 0x3ab9, 0x3ab9, 0x3ab9, 0x3ab9, 0x3ab3, 0x3ab3, + 0x3ab3, 0x3ab3, 0x3aae, 0x3aae, 0x3aae, 0x3aae, 0x3aa9, 0x3aa9, 0x3aa9, 0x3aa3, 0x3aa3, 0x3aa3, + 0x3aa3, 0x3a9e, 0x3a9e, 0x3a9e, 0x3a9e, 0x3a99, 0x3a99, 0x3a99, 0x3a99, 0x3a94, 0x3a94, 0x3a94, + 0x3a94, 0x3a90, 0x3a90, 0x3a90, 0x3a90, 0x3a8c, 0x3a8c, 0x3a8c, 0x3a8c, 0x3a87, 0x3a87, 0x3a87, + 0x3a87, 0x3a83, 0x3a83, 0x3a83, 0x3a83, 0x3a7e, 0x3a7e, 0x3a7e, 0x3a7e, 
0x3a76, 0x3a76, 0x3a76, + 0x3a76, 0x3a6f, 0x3a6f, 0x3a6f, 0x3a68, 0x3a68, 0x3a68, 0x3a68, 0x3a60, 0x3a60, 0x3a60, 0x3a60, + 0x3a59, 0x3a59, 0x3a59, 0x3a59, 0x3a53, 0x3a53, 0x3a53, 0x3a53, 0x3a4d, 0x3a4d, 0x3a4d, 0x3a4d, + 0x3a46, 0x3a46, 0x3a46, 0x3a46, 0x3a40, 0x3a40, 0x3a40, 0x3a40, 0x3a3a, 0x3a3a, 0x3a3a, 0x3a3a, + 0x3a34, 0x3a34, 0x3a34, 0x3a34, 0x3a2f, 0x3a2f, 0x3a2f, 0x3a2f, 0x3a2a, 0x3a2a, 0x3a2a, 0x3a24, + 0x3a24, 0x3a24, 0x3a24, 0x3a1f, 0x3a1f, 0x3a1f, 0x3a1f, 0x3a1a, 0x3a1a, 0x3a1a, 0x3a1a, 0x3a15, + 0x3a15, 0x3a15, 0x3a15, 0x3a11, 0x3a11, 0x3a11, 0x3a11, 0x3a0d, 0x3a0d, 0x3a0d, 0x3a0d, 0x3a08, + 0x3a08, 0x3a08, 0x3a08, 0x3a04, 0x3a04, 0x3a04, 0x3a04, 0x3a00, 0x3a00, 0x3a00, 0x3a00, 0x39f8, + 0x39f8, 0x39f8, 0x39f0, 0x39f0, 0x39f0, 0x39f0, 0x39e9, 0x39e9, 0x39e9, 0x39e9, 0x39e2, 0x39e2, + 0x39e2, 0x39e2, 0x39db, 0x39db, 0x39db, 0x39db, 0x39d4, 0x39d4, 0x39d4, 0x39d4, 0x39ce, 0x39ce, + 0x39ce, 0x39ce, 0x39c7, 0x39c7, 0x39c7, 0x39c7, 0x39c1, 0x39c1, 0x39c1, 0x39c1, 0x39bb, 0x39bb, + 0x39bb, 0x39bb, 0x39b5, 0x39b5, 0x39b5, 0x39b5, 0x39b0, 0x39b0, +}; + +// FIXME: not hard code +// contribute from hw, fix with `PRE_DATA` input +static double sigmode_golden[] = { + 0.5, 0.501999989, 0.503999915, 0.505999712, 0.507999317, 0.509998667, 0.511997697, + 0.513996342, 0.515994541, 0.517992228, 0.51998934, 0.521985814, 0.523981585, 0.525976591, + 0.527970767, 0.529964052, 0.531956381, 0.533947691, 0.535937921, 0.537927006, 0.539914885, + 0.541901494, 0.543886772, 0.545870657, 0.547853086, 0.549833997, 0.55181333, 0.553791023, + 0.555767014, 0.557741243, 0.559713649, 0.561684172, 0.56365275, 0.565619325, 0.567583836, + 0.569546224, 0.571506429, 0.573464394, 0.575420058, 0.577373363, 0.579324252, 0.581272667, + 0.583218549, 0.585161842, 0.58710249, 0.589040434, 0.59097562, 0.59290799, 0.594837491, + 0.596764066, 0.59868766, 0.60060822, 0.60252569, 0.604440017, 0.606351149, 0.608259031, + 0.610163611, 0.612064837, 0.613962657, 0.61585702, 0.617747875, 0.61963517, 
0.621518857, + 0.623398885, 0.625275204, 0.627147766, 0.629016523, 0.630881426, 0.632742428, 0.634599482, + 0.63645254, 0.638301558, 0.640146488, 0.641987286, 0.643823907, 0.645656306, 0.64748444, + 0.649308265, 0.651127739, 0.652942818, 0.654753461, 0.656559626, 0.658361272, 0.66015836, + 0.661950848, 0.663738697, 0.665521869, 0.667300325, 0.669074026, 0.670842936, 0.672607017, + 0.674366233, 0.676120548, 0.677869926, 0.679614333, 0.681353734, 0.683088095, 0.684817383, + 0.686541565, 0.688260608, 0.689974481, 0.691683153, 0.693386592, 0.695084769, 0.696777653, + 0.698465216, 0.700147429, 0.701824263, 0.703495691, 0.705161686, 0.706822221, 0.70847727, + 0.710126808, 0.71177081, 0.71340925, 0.715042106, 0.716669353, 0.718290968, 0.71990693, + 0.721517216, 0.723121805, 0.724720676, 0.726313808, 0.727901182, 0.729482779, 0.731058579, + 0.732628564, 0.734192716, 0.735751018, 0.737303454, 0.738850006, 0.740390659, 0.741925398, + 0.743454208, 0.744977074, 0.746493983, 0.748004922, 0.749509876, 0.751008835, 0.752501785, + 0.753988716, 0.755469617, 0.756944477, 0.758413287, 0.759876035, 0.761332715, 0.762783316, + 0.764227831, 0.765666252, 0.767098572, 0.768524783, 0.769944881, 0.771358858, 0.772766709, + 0.774168429, 0.775564014, 0.77695346, 0.778336762, 0.779713917, 0.781084923, 0.782449776, + 0.783808476, 0.78516102, 0.786507407, 0.787847636, 0.789181707, 0.790509619, 0.791831373, + 0.79314697, 0.794456411, 0.795759698, 0.797056831, 0.798347814, 0.79963265, 0.80091134, + 0.802183889, 0.803450299, 0.804710577, 0.805964724, 0.807212748, 0.808454651, 0.809690441, + 0.810920123, 0.812143702, 0.813361186, 0.814572581, 0.815777894, 0.816977132, 0.818170304, + 0.819357418, 0.820538481, 0.821713502, 0.82288249, 0.824045455, 0.825202406, 0.826353353, + 0.827498306, 0.828637274, 0.82977027, 0.830897303, 0.832018385, 0.833133528, 0.834242742, + 0.83534604, 0.836443435, 0.837534937, 0.838620561, 0.83970032, 0.840774225, 0.841842291, + 0.842904531, 0.843960959, 0.84501159, 
0.846056436, 0.847095514, 0.848128836, 0.84915642, + 0.850178278, 0.851194427, 0.852204883, 0.85320966, 0.854208775, 0.855202244, 0.856190082, + 0.857172307, 0.858148935, 0.859119982, 0.860085466, 0.861045403, 0.861999811, 0.862948707, + 0.863892109, 0.864830034, 0.8657625, 0.866689525, 0.867611126, 0.868527324, 0.869438134, + 0.870343577, 0.871243671, 0.872138434, 0.873027885, 0.873912043, 0.874790928, 0.875664558, + 0.876532952, 0.877396131, 0.878254114, 0.879106919, 0.879954567, 0.880797078, 0.881634471, + 0.882466767, 0.883293985, 0.884116145, 0.884933268, 0.885745374, 0.886552483, 0.887354615, + 0.888151792, 0.888944033, 0.88973136, 0.890513792, 0.89129135, 0.892064056, 0.89283193, + 0.893594992, 0.894353264, 0.895106767, 0.895855521, 0.896599549, 0.897338869, 0.898073505, + 0.898803476, 0.899528804, 0.900249511, 0.900965617, 0.901677143, 0.902384111, 0.903086543, + 0.903784458, 0.90447788, 0.905166828, 0.905851324, 0.90653139, 0.907207047, 0.907878316, + 0.908545218, 0.909207776, 0.90986601, 0.910519941, 0.911169591, 0.911814981, 0.912456133, + 0.913093067, 0.913725806, 0.914354369, 0.91497878, 0.915599058, 0.916215226, 0.916827304, + 0.917435313, 0.918039275, 0.91863921, 0.919235141, 0.919827088, 0.920415072, 0.920999114, + 0.921579235, 0.922155456, 0.922727798, 0.923296282, 0.923860929, 0.92442176, 0.924978795, + 0.925532055, 0.926081561, 0.926627334, 0.927169394, 0.927707762, 0.928242458, 0.928773503, + 0.929300917, 0.929824721, 0.930344935, 0.93086158, 0.931374675, 0.931884241, 0.932390297, + 0.932892865, 0.933391964, 0.933887615, 0.934379836, 0.934868648, 0.93535407, 0.935836124, + 0.936314827, 0.9367902, 0.937262263, 0.937731034, 0.938196534, 0.938658781, 0.939117796, + 0.939573597, 0.940026203, 0.940475634, 0.940921909, 0.941365046, 0.941805065, 0.942241985, + 0.942675824, 0.943106601, 0.943534335, 0.943959044, 0.944380747, 0.944799462, 0.945215208, + 0.945628003, 0.946037865, 0.946444813, 0.946848864, 0.947250036, 0.947648348, 0.948043817, + 
0.948436462, 0.948826299, 0.949213347, 0.949597623, 0.949979144, 0.950357929, 0.950733994, + 0.951107357, 0.951478034, 0.951846044, 0.952211402, 0.952574127, 0.952934234, 0.953291742, + 0.953646665, 0.953999022, 0.954348829, 0.954696102, 0.955040858, 0.955383113, 0.955722883, + 0.956060185, 0.956395034, 0.956727447, 0.95705744, 0.957385028, 0.957710228, 0.958033055, + 0.958353525, 0.958671653, 0.958987455, 0.959300946, 0.959612142, 0.959921058, 0.960227709, + 0.960532111, 0.960834277, 0.961134224, 0.961431966, 0.961727518, 0.962020894, 0.962312109, + 0.962601179, 0.962888117, 0.963172937, 0.963455655, 0.963736284, 0.964014838, 0.964291332, + 0.96456578, 0.964838195, 0.965108591, 0.965376983, 0.965643384, 0.965907808, 0.966170267, + 0.966430777, 0.966689349, 0.966945998, 0.967200737, 0.967453578, 0.967704535, 0.967953622, + 0.96820085, 0.968446233, 0.968689784, 0.968931516, 0.96917144, 0.969409571, 0.969645919, + 0.969880498, 0.97011332, 0.970344398, 0.970573743, 0.970801367, 0.971027284, 0.971251504, + 0.97147404, 0.971694904, 0.971914107, 0.972131661, 0.972347578, 0.972561869, 0.972774546, + 0.97298562, 0.973195103, 0.973403006, 0.973609341, 0.973814117, 0.974017347, 0.974219042, + 0.974419212, 0.974617868, 0.974815021, 0.975010683, 0.975204863, 0.975397572, 0.97558882, + 0.975778619, 0.975966979, 0.97615391, 0.976339422, 0.976523525, 0.97670623, 0.976887547, + 0.977067486, 0.977246057, 0.977423269, 0.977599132, 0.977773657, 0.977946853, 0.978118729, + 0.978289296, 0.978458562, 0.978626537, 0.978793231, 0.978958653, 0.979122812, 0.979285717, + 0.979447378, 0.979607804, 0.979767003, 0.979924985, 0.980081758, 0.980237332, 0.980391715, + 0.980544915, 0.980696943, 0.980847805, 0.980997512, 0.981146071, 0.98129349, 0.981439779, + 0.981584945, 0.981728996, 0.981871942, 0.98201379, 0.982154548, 0.982294225, 0.982432827, + 0.982570364, 0.982706843, 0.982842273, 0.982976659, 0.983110012, 0.983242337, 0.983373644, + 0.983503939, 0.983633229, 0.983761524, 0.983888829, 
0.984015152, 0.9841405, 0.984264882, + 0.984388303, 0.984510772, 0.984632294, 0.984752879, 0.984872531, 0.984991259, 0.985109069, + 0.985225968, 0.985341963, 0.985457061, 0.985571269, 0.985684592, 0.985797039, 0.985908614, + 0.986019326, 0.98612918, 0.986238183, 0.986346341, 0.986453661, 0.986560148, 0.98666581, + 0.986770653, 0.986874682, 0.986977903, 0.987080324, 0.98718195, 0.987282786, 0.987382839, + 0.987482115, 0.98758062, 0.98767836, 0.987775339, 0.987871565, 0.987967043, 0.988061778, + 0.988155776, 0.988249042, 0.988341583, 0.988433404, 0.98852451, 0.988614907, 0.9887046, + 0.988793594, 0.988881895, 0.988969507, 0.989056437, 0.98914269, 0.98922827, 0.989313183, + 0.989397433, 0.989481027, 0.989563968, 0.989646262, 0.989727914, 0.989808929, 0.989889312, + 0.989969066, 0.990048198, 0.990126712, 0.990204613, 0.990281905, 0.990358593, 0.990434681, + 0.990510175, 0.990585079, 0.990659397, 0.990733134, 0.990806295, 0.990878883, 0.990950903, + 0.99102236, 0.991093257, 0.9911636, 0.991233391, 0.991302637, 0.99137134, 0.991439506, + 0.991507137, 0.991574239, 0.991640815, 0.991706869, 0.991772406, 0.991837429, 0.991901942, + 0.99196595, 0.992029456, 0.992092463, 0.992154977, 0.992217, 0.992278537, 0.992339591, + 0.992400166, 0.992460265, 0.992519893, 0.992579053, 0.992637749, 0.992695983, 0.99275376, + 0.992811084, 0.992867957, 0.992924384, 0.992980367, 0.993035911, 0.993091018, 0.993145692, + 0.993199936, 0.993253754, 0.993307149, 0.993360124, 0.993412683, 0.993464828, 0.993516563, + 0.993567892, 0.993618816, 0.99366934, 0.993719466, 0.993769198, 0.993818539, 0.993867491, + 0.993916059, 0.993964243, 0.994012049, 0.994059478, 0.994106533, 0.994153219, 0.994199536, + 0.994245489, 0.994291079, 0.994336311, 0.994381186, 0.994425708, 0.994469878, 0.994513701, + 0.994557178, 0.994600313, 0.994643108, 0.994685565, 0.994727688, 0.994769478, 0.994810939, + 0.994852073, 0.994892883, 0.994933371, 0.994973539, 0.995013391, 0.995052928, 0.995092153, + 0.995131069, 0.995169677, 
0.995207981, 0.995245983, 0.995283685, 0.995321089, 0.995358198, + 0.995395014, 0.995431539, 0.995467776, 0.995503727, 0.995539394, 0.995574779, 0.995609885, + 0.995644713, 0.995679266, 0.995713547, 0.995747556, 0.995781297, 0.995814772, 0.995847981, + 0.995880929, 0.995913616, 0.995946044, 0.995978217, 0.996010135, 0.996041801, 0.996073216, + 0.996104383, 0.996135304, 0.99616598, 0.996196413, 0.996226606, 0.996256561, 0.996286278, + 0.99631576, 0.996345009, 0.996374027, 0.996402815, 0.996431375, 0.99645971, 0.99648782, + 0.996515708, 0.996543375, 0.996570823, 0.996598054, 0.99662507, 0.996651872, 0.996678461, + 0.99670484, 0.99673101, 0.996756974, 0.996782731, 0.996808285, 0.996833636, 0.996858787, + 0.996883738, 0.996908492, 0.99693305, 0.996957413, 0.996981584, 0.997005563, 0.997029352, + 0.997052952, 0.997076366, 0.997099594, 0.997122638, 0.9971455, 0.99716818, 0.997190681, + 0.997213004, 0.997235149, 0.99725712, 0.997278916, 0.997300539, 0.997321991, 0.997343273, + 0.997364386, 0.997385332, 0.997406112, 0.997426727, 0.997447179, 0.997467468, 0.997487597, + 0.997507566, 0.997527377, 0.997547031, 0.997566528, 0.997585872, 0.997605062, 0.997624099, + 0.997642986, 0.997661723, 0.997680312, 0.997698752, 0.997717047, 0.997735197, 0.997753202, + 0.997771065, 0.997788786, 0.997806367, 0.997823808, 0.99784111, 0.997858276, 0.997875305, + 0.997892199, 0.997908959, 0.997925586, 0.997942081, 0.997958445, 0.99797468, 0.997990785, + 0.998006763, 0.998022614, 0.998038339, 0.998053939, 0.998069415, 0.998084769, 0.998100001, + 0.998115112, 0.998130102, 0.998144974, 0.998159728, 0.998174365, 0.998188885, 0.99820329, + 0.998217581, 0.998231759, 0.998245823, 0.998259777, 0.998273619, 0.998287351, 0.998300975, + 0.99831449, 0.998327898, 0.998341199, 0.998354395, 0.998367486, 0.998380473, 0.998393356, + 0.998406138, 0.998418818, 0.998431397, 0.998443876, 0.998456256, 0.998468538, 0.998480723, + 0.99849281, 0.998504802, 0.998516698, 0.998528499, 0.998540207, 0.998551822, 
0.998563345, + 0.998574776, 0.998586116, 0.998597366, 0.998608527, 0.998619599, 0.998630583, 0.99864148, + 0.99865229, 0.998663015, 0.998673654, 0.998684208, 0.998694679, 0.998705066, 0.998715371, + 0.998725594, 0.998735736, 0.998745797, 0.998755778, 0.99876568, 0.998775503, 0.998785248, + 0.998794916, 0.998804507, 0.998814021, 0.99882346, 0.998832824, 0.998842113, 0.998851329, + 0.998860471, 0.998869541, 0.998878538, 0.998887464, 0.998896319, 0.998905104, 0.998913818, + 0.998922464, 0.99893104, 0.998939549, 0.99894799, 0.998956364, 0.998964671, 0.998972912, + 0.998981088, 0.998989198, 0.998997244, 0.999005226, 0.999013145, 0.999021001, 0.999028794, + 0.999036525, 0.999044195, 0.999051803, 0.999059352, 0.99906684, 0.999074268, 0.999081638, + 0.999088949, 0.999096202, 0.999103397, 0.999110535, 0.999117616, 0.99912464, 0.999131609, + 0.999138523, 0.999145381, 0.999152185, 0.999158935, 0.999165631, 0.999172274, 0.999178864, + 0.999185401, 0.999191887, 0.999198321, 0.999204704, 0.999211036, 0.999217317, 0.999223549, + 0.999229731, 0.999235864, 0.999241948, 0.999247984, 0.999253971, 0.999259911, 0.999265804, + 0.99927165, 0.999277449, 0.999283202, 0.99928891, 0.999294572, 0.999300189, 0.999305761, + 0.999311289, 0.999316773, 0.999322213, 0.99932761, 0.999332964, 0.999338276, 0.999343545, + 0.999348772, 0.999353958, 0.999359103, 0.999364206, 0.999369269, 0.999374291, 0.999379274, + 0.999384217, 0.999389121, 0.999393985, 0.999398811, 0.999403599, 0.999408348, 0.99941306, + 0.999417734, 0.99942237, 0.99942697, 0.999431534, 0.999436061, 0.999440552, 0.999445007, + 0.999449427, 0.999453811, 0.999458161, 0.999462476, 0.999466757, 0.999471004, 0.999475217, + 0.999479396, 0.999483542, 0.999487655, 0.999491735, 0.999495783, 0.999499799, 0.999503783, + 0.999507735, 0.999511655, 0.999515544, 0.999519403, 0.99952323, 0.999527027, 0.999530794, + 0.999534531, 0.999538238, 0.999541916, 0.999545564, 0.999549184, 0.999552774, 0.999556336, + 0.99955987, 0.999563375, 0.999566853, 
0.999570303, 0.999573725, 0.99957712, 0.999580488, + 0.99958383, 0.999587145, 0.999590433, 0.999593695, 0.999596931, 0.999600142, 0.999603326, + 0.999606486, 0.99960962, 0.99961273, 0.999615814, 0.999618874, 0.99962191, 0.999624921, + 0.999627909, 0.999630873, 0.999633813, 0.99963673, 0.999639623, 0.999642494, 0.999645341, + 0.999648166, 0.999650969, 0.999653749, 0.999656507, 0.999659243, 0.999661957, 0.498000011, + 0.496000085, 0.494000288, 0.492000683, 0.490001333, 0.488002303, 0.486003658, 0.484005459, + 0.482007772, 0.48001066, 0.478014186, 0.476018415, 0.474023409, 0.472029233, 0.470035948, + 0.468043619, 0.466052309, 0.464062079, 0.462072994, 0.460085115, 0.458098506, 0.456113228, + 0.454129343, 0.452146914, 0.450166003, 0.44818667, 0.446208977, 0.444232986, 0.442258757, + 0.440286351, 0.438315828, 0.43634725, 0.434380675, 0.432416164, 0.430453776, 0.428493571, + 0.426535606, 0.424579942, 0.422626637, 0.420675748, 0.418727333, 0.416781451, 0.414838158, + 0.41289751, 0.410959566, 0.40902438, 0.40709201, 0.405162509, 0.403235934, 0.40131234, + 0.39939178, 0.39747431, 0.395559983, 0.393648851, 0.391740969, 0.389836389, 0.387935163, + 0.386037343, 0.38414298, 0.382252125, 0.38036483, 0.378481143, 0.376601115, 0.374724796, + 0.372852234, 0.370983477, 0.369118574, 0.367257572, 0.365400518, 0.36354746, 0.361698442, + 0.359853512, 0.358012714, 0.356176093, 0.354343694, 0.35251556, 0.350691735, 0.348872261, + 0.347057182, 0.345246539, 0.343440374, 0.341638728, 0.33984164, 0.338049152, 0.336261303, + 0.334478131, 0.332699675, 0.330925974, 0.329157064, 0.327392983, 0.325633767, 0.323879452, + 0.322130074, 0.320385667, 0.318646266, 0.316911905, 0.315182617, 0.313458435, 0.311739392, + 0.310025519, 0.308316847, 0.306613408, 0.304915231, 0.303222347, 0.301534784, 0.299852571, + 0.298175737, 0.296504309, 0.294838314, 0.293177779, 0.29152273, 0.289873192, 0.28822919, + 0.28659075, 0.284957894, 0.283330647, 0.281709032, 0.28009307, 0.278482784, 0.276878195, + 0.275279324, 
0.273686192, 0.272098818, 0.270517221, 0.268941421, 0.267371436, 0.265807284, + 0.264248982, 0.262696546, 0.261149994, 0.259609341, 0.258074602, 0.256545792, 0.255022926, + 0.253506017, 0.251995078, 0.250490124, 0.248991165, 0.247498215, 0.246011284, 0.244530383, + 0.243055523, 0.241586713, 0.240123965, 0.238667285, 0.237216684, 0.235772169, 0.234333748, + 0.232901428, 0.231475217, 0.230055119, 0.228641142, 0.227233291, 0.225831571, 0.224435986, + 0.22304654, 0.221663238, 0.220286083, 0.218915077, 0.217550224, 0.216191524, 0.21483898, + 0.213492593, 0.212152364, 0.210818293, 0.209490381, 0.208168627, 0.20685303, 0.205543589, + 0.204240302, 0.202943169, 0.201652186, 0.20036735, 0.19908866, 0.197816111, 0.196549701, + 0.195289423, 0.194035276, 0.192787252, 0.191545349, 0.190309559, 0.189079877, 0.187856298, + 0.186638814, 0.185427419, 0.184222106, 0.183022868, 0.181829696, 0.180642582, 0.179461519, + 0.178286498, 0.17711751, 0.175954545, 0.174797594, 0.173646647, 0.172501694, 0.171362726, + 0.17022973, 0.169102697, 0.167981615, 0.166866472, 0.165757258, 0.16465396, 0.163556565, + 0.162465063, 0.161379439, 0.16029968, 0.159225775, 0.158157709, 0.157095469, 0.156039041, + 0.15498841, 0.153943564, 0.152904486, 0.151871164, 0.15084358, 0.149821722, 0.148805573, + 0.147795117, 0.14679034, 0.145791225, 0.144797756, 0.143809918, 0.142827693, 0.141851065, + 0.140880018, 0.139914534, 0.138954597, 0.138000189, 0.137051293, 0.136107891, 0.135169966, + 0.1342375, 0.133310475, 0.132388874, 0.131472676, 0.130561866, 0.129656423, 0.128756329, + 0.127861566, 0.126972115, 0.126087957, 0.125209072, 0.124335442, 0.123467048, 0.122603869, + 0.121745886, 0.120893081, 0.120045433, 0.119202922, 0.118365529, 0.117533233, 0.116706015, + 0.115883855, 0.115066732, 0.114254626, 0.113447517, 0.112645385, 0.111848208, 0.111055967, + 0.11026864, 0.109486208, 0.10870865, 0.107935944, 0.10716807, 0.106405008, 0.105646736, + 0.104893233, 0.104144479, 0.103400451, 0.102661131, 0.101926495, 
0.101196524, 0.100471196, + 0.099750489, 0.099034383, 0.098322857, 0.097615889, 0.096913457, 0.096215542, 0.09552212, + 0.094833172, 0.094148676, 0.09346861, 0.092792953, 0.092121684, 0.091454782, 0.090792224, + 0.09013399, 0.089480059, 0.088830409, 0.088185019, 0.087543867, 0.086906933, 0.086274194, + 0.085645631, 0.08502122, 0.084400942, 0.083784774, 0.083172696, 0.082564687, 0.081960725, + 0.08136079, 0.080764859, 0.080172912, 0.079584928, 0.079000886, 0.078420765, 0.077844544, + 0.077272202, 0.076703718, 0.076139071, 0.07557824, 0.075021205, 0.074467945, 0.073918439, + 0.073372666, 0.072830606, 0.072292238, 0.071757542, 0.071226497, 0.070699083, 0.070175279, + 0.069655065, 0.06913842, 0.068625325, 0.068115759, 0.067609703, 0.067107135, 0.066608036, + 0.066112385, 0.065620164, 0.065131352, 0.06464593, 0.064163876, 0.063685173, 0.0632098, + 0.062737737, 0.062268966, 0.061803466, 0.061341219, 0.060882204, 0.060426403, 0.059973797, + 0.059524366, 0.059078091, 0.058634954, 0.058194935, 0.057758015, 0.057324176, 0.056893399, + 0.056465665, 0.056040956, 0.055619253, 0.055200538, 0.054784792, 0.054371997, 0.053962135, + 0.053555187, 0.053151136, 0.052749964, 0.052351652, 0.051956183, 0.051563538, 0.051173701, + 0.050786653, 0.050402377, 0.050020856, 0.049642071, 0.049266006, 0.048892643, 0.048521966, + 0.048153956, 0.047788598, 0.047425873, 0.047065766, 0.046708258, 0.046353335, 0.046000978, + 0.045651171, 0.045303898, 0.044959142, 0.044616887, 0.044277117, 0.043939815, 0.043604966, + 0.043272553, 0.04294256, 0.042614972, 0.042289772, 0.041966945, 0.041646475, 0.041328347, + 0.041012545, 0.040699054, 0.040387858, 0.040078942, 0.039772291, 0.039467889, 0.039165723, + 0.038865776, 0.038568034, 0.038272482, 0.037979106, 0.037687891, 0.037398821, 0.037111883, + 0.036827063, 0.036544345, 0.036263716, 0.035985162, 0.035708668, 0.03543422, 0.035161805, + 0.034891409, 0.034623017, 0.034356616, 0.034092192, 0.033829733, 0.033569223, 0.033310651, + 0.033054002, 0.032799263, 
0.032546422, 0.032295465, 0.032046378, 0.03179915, 0.031553767, + 0.031310216, 0.031068484, 0.03082856, 0.030590429, 0.030354081, 0.030119502, 0.02988668, + 0.029655602, 0.029426257, 0.029198633, 0.028972716, 0.028748496, 0.02852596, 0.028305096, + 0.028085893, 0.027868339, 0.027652422, 0.027438131, 0.027225454, 0.02701438, 0.026804897, + 0.026596994, 0.026390659, 0.026185883, 0.025982653, 0.025780958, 0.025580788, 0.025382132, + 0.025184979, 0.024989317, 0.024795137, 0.024602428, 0.02441118, 0.024221381, 0.024033021, + 0.02384609, 0.023660578, 0.023476475, 0.02329377, 0.023112453, 0.022932514, 0.022753943, + 0.022576731, 0.022400868, 0.022226343, 0.022053147, 0.021881271, 0.021710704, 0.021541438, + 0.021373463, 0.021206769, 0.021041347, 0.020877188, 0.020714283, 0.020552622, 0.020392196, + 0.020232997, 0.020075015, 0.019918242, 0.019762668, 0.019608285, 0.019455085, 0.019303057, + 0.019152195, 0.019002488, 0.018853929, 0.01870651, 0.018560221, 0.018415055, 0.018271004, + 0.018128058, 0.01798621, 0.017845452, 0.017705775, 0.017567173, 0.017429636, 0.017293157, + 0.017157727, 0.017023341, 0.016889988, 0.016757663, 0.016626356, 0.016496061, 0.016366771, + 0.016238476, 0.016111171, 0.015984848, 0.0158595, 0.015735118, 0.015611697, 0.015489228, + 0.015367706, 0.015247121, 0.015127469, 0.015008741, 0.014890931, 0.014774032, 0.014658037, + 0.014542939, 0.014428731, 0.014315408, 0.014202961, 0.014091386, 0.013980674, 0.01387082, + 0.013761817, 0.013653659, 0.013546339, 0.013439852, 0.01333419, 0.013229347, 0.013125318, + 0.013022097, 0.012919676, 0.01281805, 0.012717214, 0.012617161, 0.012517885, 0.01241938, + 0.01232164, 0.012224661, 0.012128435, 0.012032957, 0.011938222, 0.011844224, 0.011750958, + 0.011658417, 0.011566596, 0.01147549, 0.011385093, 0.0112954, 0.011206406, 0.011118105, + 0.011030493, 0.010943563, 0.01085731, 0.01077173, 0.010686817, 0.010602567, 0.010518973, + 0.010436032, 0.010353738, 0.010272086, 0.010191071, 0.010110688, 0.010030934, 0.009951802, + 
0.009873288, 0.009795387, 0.009718095, 0.009641407, 0.009565319, 0.009489825, 0.009414921, + 0.009340603, 0.009266866, 0.009193705, 0.009121117, 0.009049097, 0.00897764, 0.008906743, + 0.0088364, 0.008766609, 0.008697363, 0.00862866, 0.008560494, 0.008492863, 0.008425761, + 0.008359185, 0.008293131, 0.008227594, 0.008162571, 0.008098058, 0.00803405, 0.007970544, + 0.007907537, 0.007845023, 0.007783, 0.007721463, 0.007660409, 0.007599834, 0.007539735, + 0.007480107, 0.007420947, 0.007362251, 0.007304017, 0.00724624, 0.007188916, 0.007132043, + 0.007075616, 0.007019633, 0.006964089, 0.006908982, 0.006854308, 0.006800064, 0.006746246, + 0.006692851, 0.006639876, 0.006587317, 0.006535172, 0.006483437, 0.006432108, 0.006381184, + 0.00633066, 0.006280534, 0.006230802, 0.006181461, 0.006132509, 0.006083941, 0.006035757, + 0.005987951, 0.005940522, 0.005893467, 0.005846781, 0.005800464, 0.005754511, 0.005708921, + 0.005663689, 0.005618814, 0.005574292, 0.005530122, 0.005486299, 0.005442822, 0.005399687, + 0.005356892, 0.005314435, 0.005272312, 0.005230522, 0.005189061, 0.005147927, 0.005107117, + 0.005066629, 0.005026461, 0.004986609, 0.004947072, 0.004907847, 0.004868931, 0.004830323, + 0.004792019, 0.004754017, 0.004716315, 0.004678911, 0.004641802, 0.004604986, 0.004568461, + 0.004532224, 0.004496273, 0.004460606, 0.004425221, 0.004390115, 0.004355287, 0.004320734, + 0.004286453, 0.004252444, 0.004218703, 0.004185228, 0.004152019, 0.004119071, 0.004086384, + 0.004053956, 0.004021783, 0.003989865, 0.003958199, 0.003926784, 0.003895617, 0.003864696, + 0.00383402, 0.003803587, 0.003773394, 0.003743439, 0.003713722, 0.00368424, 0.003654991, + 0.003625973, 0.003597185, 0.003568625, 0.00354029, 0.00351218, 0.003484292, 0.003456625, + 0.003429177, 0.003401946, 0.00337493, 0.003348128, 0.003321539, 0.00329516, 0.00326899, + 0.003243026, 0.003217269, 0.003191715, 0.003166364, 0.003141213, 0.003116262, 0.003091508, + 0.00306695, 0.003042587, 0.003018416, 0.002994437, 0.002970648, 
0.002947048, 0.002923634, + 0.002900406, 0.002877362, 0.0028545, 0.00283182, 0.002809319, 0.002786996, 0.002764851, + 0.00274288, 0.002721084, 0.002699461, 0.002678009, 0.002656727, 0.002635614, 0.002614668, + 0.002593888, 0.002573273, 0.002552821, 0.002532532, 0.002512403, 0.002492434, 0.002472623, + 0.002452969, 0.002433472, 0.002414128, 0.002394938, 0.002375901, 0.002357014, 0.002338277, + 0.002319688, 0.002301248, 0.002282953, 0.002264803, 0.002246798, 0.002228935, 0.002211214, + 0.002193633, 0.002176192, 0.00215889, 0.002141724, 0.002124695, 0.002107801, 0.002091041, + 0.002074414, 0.002057919, 0.002041555, 0.00202532, 0.002009215, 0.001993237, 0.001977386, + 0.001961661, 0.001946061, 0.001930585, 0.001915231, 0.001899999, 0.001884888, 0.001869898, + 0.001855026, 0.001840272, 0.001825635, 0.001811115, 0.00179671, 0.001782419, 0.001768241, + 0.001754177, 0.001740223, 0.001726381, 0.001712649, 0.001699025, 0.00168551, 0.001672102, + 0.001658801, 0.001645605, 0.001632514, 0.001619527, 0.001606644, 0.001593862, 0.001581182, + 0.001568603, 0.001556124, 0.001543744, 0.001531462, 0.001519277, 0.00150719, 0.001495198, + 0.001483302, 0.001471501, 0.001459793, 0.001448178, 0.001436655, 0.001425224, 0.001413884, + 0.001402634, 0.001391473, 0.001380401, 0.001369417, 0.00135852, 0.00134771, 0.001336985, + 0.001326346, 0.001315792, 0.001305321, 0.001294934, 0.001284629, 0.001274406, 0.001264264, + 0.001254203, 0.001244222, 0.00123432, 0.001224497, 0.001214752, 0.001205084, 0.001195493, + 0.001185979, 0.00117654, 0.001167176, 0.001157887, 0.001148671, 0.001139529, 0.001130459, + 0.001121462, 0.001112536, 0.001103681, 0.001094896, 0.001086182, 0.001077536, 0.00106896, + 0.001060451, 0.00105201, 0.001043636, 0.001035329, 0.001027088, 0.001018912, 0.001010802, + 0.001002756, 0.000994774, 0.000986855, 0.000978999, 0.000971206, 0.000963475, 0.000955805, + 0.000948197, 0.000940648, 0.00093316, 0.000925732, 0.000918362, 0.000911051, 0.000903798, + 0.000896603, 0.000889465, 
0.000882384, 0.00087536, 0.000868391, 0.000861477, 0.000854619, + 0.000847815, 0.000841065, 0.000834369, 0.000827726, 0.000821136, 0.000814599, 0.000808113, + 0.000801679, 0.000795296, 0.000788964, 0.000782683, 0.000776451, 0.000770269, 0.000764136, + 0.000758052, 0.000752016, 0.000746029, 0.000740089, 0.000734196, 0.00072835, 0.000722551, + 0.000716798, 0.00071109, 0.000705428, 0.000699811, 0.000694239, 0.000688711, 0.000683227, + 0.000677787, 0.00067239, 0.000667036, 0.000661724, 0.000656455, 0.000651228, 0.000646042, + 0.000640897, 0.000635794, 0.000630731, 0.000625709, 0.000620726, 0.000615783, 0.000610879, + 0.000606015, 0.000601189, 0.000596401, 0.000591652, 0.00058694, 0.000582266, 0.00057763, + 0.00057303, 0.000568466, 0.000563939, 0.000559448, 0.000554993, 0.000550573, 0.000546189, + 0.000541839, 0.000537524, 0.000533243, 0.000528996, 0.000524783, 0.000520604, 0.000516458, + 0.000512345, 0.000508265, 0.000504217, 0.000500201, 0.000496217, 0.000492265, 0.000488345, + 0.000484456, 0.000480597, 0.00047677, 0.000472973, 0.000469206, 0.000465469, 0.000461762, + 0.000458084, 0.000454436, 0.000450816, 0.000447226, 0.000443664, 0.00044013, 0.000436625, + 0.000433147, 0.000429697, 0.000426275, 0.00042288, 0.000419512, 0.00041617, 0.000412855, + 0.000409567, 0.000406305, 0.000403069, 0.000399858, 0.000396674, 0.000393514, 0.00039038, + 0.00038727, 0.000384186, 0.000381126, 0.00037809, 0.000375079, 0.000372091, 0.000369127, + 0.000366187, 0.00036327, 0.000360377, 0.000357506, 0.000354659, 0.000351834, 0.000349031, + 0.000346251, 0.000343493, 0.000340757, 0.000338043, 0.00033535}; + +// static bool check_input_int8_range(float input) +//{ +// bool ret = input > -128.0 && input < 128.0; +// if (!ret) { +// printf("invalid int8 range, input is %f\n", input); +// } +// return ret; +//} + +static double _gen_sigmoid(float x) { return 1.0 / (1.0 + exp(-(x))); } + +static void tl_lut_ref(uint16_t *ofmap, uint16_t *ifmap, uint16_t *table, uint16_t *table_slope, + 
cvk_tl_shape_t ifmap_shape, cvk_tl_shape_t table_shape, int range_start, + int range_end) { + int tn, th, tw; + + tn = table_shape.n; + th = table_shape.h; + tw = table_shape.w; + (void)tn; + (void)th; + (void)tw; + (void)table; + (void)table_slope; + (void)range_start; + (void)range_end; + assert(tn == 1); + assert(th * tw == 256); + assert(table); + assert(table_slope); + assert(ifmap_shape.n); + assert(ifmap); + assert(ofmap); + + // TODO: use c function + // 1. dump all input as binary file +#ifdef GDB +#define INFP32FILE "infp32file.bin" +#define OUTBF16FILE "lutbf16out.bin" + FILE *pFile; + pFile = fopen(INFP32FILE, "wb"); + int shape_sz = tl_shape_size(&ifmap_shape); + float *f = new float[shape_sz]; + for (int i = 0; i < shape_sz; i++) { + f[i] = convert_bf16_fp32(ifmap[i]); + } + fwrite(f, 1, shape_sz * sizeof(float), pFile); + fclose(pFile); + + // 2. read result from `eval_lut.py` + char command[256]; + sprintf(command, + "python eval_lut.py --lut_input_range_start %d --lut_input_range_end " + "%d --inputfloat32 %s --outputbf16 %s 2>&1 > 2\n", + range_start, range_end, INFP32FILE, OUTBF16FILE); + + int r; + r = system(command); + printf("command is %s, return %d\n", command, r); + assert(r != 0); + + pFile = fopen(OUTBF16FILE, "rb"); + if (!pFile) { + fprintf(stderr, "open golden %s fail\n", OUTBF16FILE); + exit(-1); + } + + size_t file_length; + file_length = fread(ofmap, sizeof(uint16_t), tl_shape_size(&ifmap_shape), pFile); + printf("read from golden, file size %lu\n", file_length); + fclose(pFile); +#else + assert(range_start); + assert(range_end); + for (uint64_t i = 0; i < tl_shape_size(&ifmap_shape); i++) { + ofmap[i] = convert_fp32_bf16(_gen_sigmoid(convert_bf16_fp32(ifmap[i]))); + } +#endif + +#ifdef GDB + for (uint64_t i = 0; i < tl_shape_size(&ifmap_shape); i++) { + printf("ref %lu input 0x%x(%f) golden 0x%x(%f)\n", i, ifmap[i], convert_bf16_fp32(ifmap[i]), + ofmap[i], convert_bf16_fp32(ofmap[i])); + } +#endif +} + +static bool verify(uint16_t 
*ofmap_data, uint16_t *ref_data, uint64_t ofmap_size) { + int count = 0; + uint64_t size = ofmap_size; + if (mode == PRE_DATA_COMPARE_FIX) { + size = sizeof(sigmode_golden_bf16) / sizeof(uint16_t); + } else if (PRE_DATA_MAX_ERROR) { + size = sizeof(sigmode_golden) / sizeof(double); + } + + for (uint64_t i = 0; i < size; i++) { + if (mode == PRE_DATA_COMPARE_FIX) { + if (ofmap_data[i] != sigmode_golden_bf16[i]) { + fprintf(stderr, "[%d] comparing failed at ofmap_data[%lu], got %x, exp %x\n", count, i, + ofmap_data[i], sigmode_golden_bf16[i]); + exit(-1); + } + } else { + float got = convert_bf16_fp32(ofmap_data[i]); + float exp = convert_bf16_fp32(ref_data[i]); + + if (mode == PRE_DATA_MAX_ERROR) { + // cus we have better accuracy ~ 0.0039 + exp = sigmode_golden[i]; + } + + if (fabs(got - exp) > MAX_ERROR) { + fprintf(stderr, + "[%d] comparing failed at ofmap_data[%lu], got %x, exp %x, " + "diff(%f - %f) is %f\n", + count, i, ofmap_data[i], ref_data[i], got, exp, fabs(got - exp)); + count++; + } + } + } + + if (count != 0) { + printf("error count is %d\n", count); + exit(-1); + } + + return true; +} + +static void gen_input(uint16_t *ifmap, uint64_t ifmap_size) { + if (mode == PRE_DATA_COMPARE_FIX || mode == PRE_DATA_MAX_ERROR) { + memcpy(ifmap, &test_pattern, sizeof(test_pattern)); + +#ifdef GDB + for (uint64_t i = 0; i < ifmap_size; i++) { + printf("source if[%lu] is bf16 %f (bf16)with 0x%x\n", i, convert_bf16_fp32(ifmap[i]), + ifmap[i]); + } +#endif + } else { + int table_hw = 256; + for (uint64_t i = 0; i < ifmap_size; i++) { + // input range is -8 ~ +8 + float input = ((int)i % 7) * (((int)i % 2) ? 1 : -1) + 0.03 + (i % table_hw) * 0.002; + // float input = ((int)i % 10) * (((int)i % 2) ? 
1 : -1) + 0.03 + (i % + // table_hw) * 0.002; + // assert(check_input_int8_range(input)); + ifmap[i] = convert_fp32_bf16(input); +#ifdef GDB + printf("source if[%lu] is bf16 %f, input is %f (bf16)with 0x%x\n", i, + convert_bf16_fp32(ifmap[i]), input, ifmap[i]); +#endif + } + } +} + +static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk) { + // TODO: check more shape / align + cvk_tl_shape_t ifmap_shape; + if (mode == PRE_DATA_COMPARE_FIX || mode == PRE_DATA_MAX_ERROR) { + ifmap_shape = {1, 32, 8, 8}; + } else { + ifmap_shape = {1, 32, 16, 16}; + } + + cvk_fmt_t fmt = CVK_FMT_BF16; + + // get table / input shape + cvk_tl_shape_t table_shape; + cvm_table_shape(bmk, &table_shape); + cvk_tl_shape_t ofmap_shape = ifmap_shape; + + uint64_t ifmap_size = tl_shape_size(&ifmap_shape); + uint64_t table_size = tl_shape_size(&table_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + + int data_type_size = bytesize_of_fmt(fmt); + uint64_t ifmap_bytesize = ifmap_size * data_type_size; + uint64_t table_bytesize = table_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + + // alloc tg + uint16_t *ifmap = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *table_data = (uint16_t *)xmalloc(table_bytesize); + uint16_t *table_data_slope = (uint16_t *)xmalloc(table_bytesize); + uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize); + + // range depend on ur activation + int range_start = -8; + int range_end = 8; + float scale = cvm_sigmoid_scale(range_start, range_end); + + // fill tg value + gen_input(ifmap, ifmap_size); + cvm_sigmoid_tbl(table_data, table_data_slope, &table_shape, range_start, range_end); + tl_lut_ref(ref_data, ifmap, table_data, table_data_slope, ifmap_shape, table_shape, range_start, + range_end); + + // alloc tl + cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *cvk_tl_table_answer = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *cvk_tl_table_answer_slope = 
test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + + // sys->local + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)ifmap); + test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer, (uint8_t *)table_data); + test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer_slope, (uint8_t *)table_data_slope); + + // emit core function + cvm_emit_sigmoid(bmk, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_slope, + tl_ofmap_bf16, scale); + + uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, tl_ofmap_bf16); + + verify(ofmap_data, ref_data, ofmap_size); + + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_buf); + free_tl(bmk, cvk_tl_table_answer_slope); + free_tl(bmk, cvk_tl_table_answer); + free_tl(bmk, tl_ifmap); + + free(ifmap); + free(table_data); + free(table_data_slope); + free(ref_data); + free(ofmap_data); +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bmk; + int round_mode; + + round_mode = set_store_feround(); + + test_init(&ctx, &bmk); + + for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++) { + // for (int i = GEN_DATA_MAX_ERROR; i < TEST_MODE_MAX; i++) { + // for (int i = PRE_DATA_MAX_ERROR; i < GEN_DATA_MAX_ERROR; i++) { + mode = static_cast(i); + printf("test mode %d...\n", mode); + testbench(&ctx, bmk); + } + + test_exit(&ctx, bmk); + restore_feround(round_mode); + return 0; +} diff --git a/cvimath/tests/cvi1835/sqrt.cpp b/cvimath/tests/cvi1835/sqrt.cpp new file mode 100644 index 000000000..60bbacb2d --- /dev/null +++ b/cvimath/tests/cvi1835/sqrt.cpp @@ -0,0 +1,375 @@ +/** + */ +#include +#include + +#include +#include +#include +#include +#include +#include +//#define DBG + +using namespace std; + +/** + * pre_data means we test fixed pattern, it should be same sa lut + */ +enum TEST_MODE { + PRE_DATA_COMPARE_FIX = 0, // pre-data + fix compare + 
GEN_POW_20_DATA_MAX_ERROR, // generate 2^-20 ~ 2^20 value that check epsilon + TEST_MODE_MAX, +}; + +static TEST_MODE mode; + +static uint16_t test_pattern[] = { + 0x0000, 0x38D2, 0x3952, 0x399D, 0x39D2, 0x3A03, 0x3A1D, 0x3A38, 0x3A52, 0x3A6C, 0x3A83, 0x3A90, + 0x3A9D, 0x3AAA, 0x3AB8, 0x3AC5, 0x3AD2, 0x3ADF, 0x3AEC, 0x3AF9, 0x3B03, 0x3B0A, 0x3B10, 0x3B17, + 0x3B1D, 0x3B24, 0x3B2A, 0x3B31, 0x3B38, 0x3B3E, 0x3B45, 0x3B4B, 0x3B52, 0x3B58, 0x3B5F, 0x3B65, + 0x3B6C, 0x3B72, 0x3B79, 0x3B80, 0x3B83, 0x3B86, 0x3B8A, 0x3B8D, 0x3B90, 0x3B93, 0x3B97, 0x3B9A, + 0x3B9D, 0x3BA1, 0x3BA4, 0x3BA7, 0x3BAA, 0x3BAE, 0x3BB1, 0x3BB4, 0x3BB8, 0x3BBB, 0x3BBE, 0x3BC1, + 0x3BC5, 0x3BC8, 0x3BCB, 0x3BCE, 0x3BD2, 0x3BD5, 0x3BD8, 0x3BDC, 0x3BDF, 0x3BE2, 0x3BE5, 0x3BE9, + 0x3BEC, 0x3BEF, 0x3BF2, 0x3BF6, 0x3BF9, 0x3BFC, 0x3C00, 0x3C01, 0x3C03, 0x3C05, 0x3C06, 0x3C08, + 0x3C0A, 0x3C0B, 0x3C0D, 0x3C0F, 0x3C10, 0x3C12, 0x3C13, 0x3C15, 0x3C17, 0x3C18, 0x3C1A, 0x3C1C, + 0x3C1D, 0x3C1F, 0x3C21, 0x3C22, 0x3C24, 0x3C25, 0x3C27, 0x3C29, 0x3C2A, 0x3C2C, 0x3C2E, 0x3C2F, + 0x3C31, 0x3C33, 0x3C34, 0x3C36, 0x3C38, 0x3C39, 0x3C3B, 0x3C3C, 0x3C3E, 0x3C40, 0x3C41, 0x3C43, + 0x3C45, 0x3C46, 0x3C48, 0x3C4A, 0x3C4B, 0x3C4D, 0x3C4E, 0x3C50, 0x3C52, 0x3C53, 0x3C55, 0x3C57, + 0x3C58, 0x3C5A, 0x3C5C, 0x3C5D, 0x3C5F, 0x3C60, 0x3C62, 0x3C64, 0x3C65, 0x3C67, 0x3C69, 0x3C6A, + 0x3C6C, 0x3C6E, 0x3C6F, 0x3C71, 0x3C72, 0x3C74, 0x3C76, 0x3C77, 0x3C79, 0x3C7B, 0x3C7C, 0x3C7E, + 0x3C80, 0x3C81, 0x3C81, 0x3C82, 0x3C83, 0x3C84, 0x3C85, 0x3C86, 0x3C86, 0x3C87, 0x3C88, 0x3C89, + 0x3C8A, 0x3C8A, 0x3C8B, 0x3C8C, 0x3C8D, 0x3C8E, 0x3C8F, 0x3C8F, 0x3C90, 0x3C91, 0x3C92, 0x3C93, + 0x3C93, 0x3C94, 0x3C95, 0x3C96, 0x3C97, 0x3C98, 0x3C98, 0x3C99, 0x3C9A, 0x3C9B, 0x3C9C, 0x3C9C, + 0x3C9D, 0x3C9E, 0x3C9F, 0x3CA0, 0x3CA1, 0x3CA1, 0x3CA2, 0x3CA3, 0x3CA4, 0x3CA5, 0x3CA5, 0x3CA6, + 0x3CA7, 0x3CA8, 0x3CA9, 0x3CAA, 0x3CAA, 0x3CAB, 0x3CAC, 0x3CAD, 0x3CAE, 0x3CAE, 0x3CAF, 0x3CB0, + 0x3CB1, 0x3CB2, 0x3CB3, 0x3CB3, 0x3CB4, 0x3CB5, 0x3CB6, 0x3CB7, 0x3CB8, 
0x3CB8, 0x3CB9, 0x3CBA, + 0x3CBB, 0x3CBC, 0x3CBC, 0x3CBD, 0x3CBE, 0x3CBF, 0x3CC0, 0x3CC1, 0x3CC1, 0x3CC2, 0x3CC3, 0x3CC4, + 0x3CC5, 0x3CC5, 0x3CC6, 0x3CC7, 0x3CC8, 0x3CC9, 0x3CCA, 0x3CCA, 0x3CCB, 0x3CCC, 0x3CCD, 0x3CCE, + 0x3CCE, 0x3CCF, 0x3CD0, 0x3CD1, 0x3CD2, 0x3CD3, 0x3CD3, 0x3CD4, 0x3CD5, 0x3CD6, 0x3CD7, 0x3CD7, + 0x3CD8, 0x3CD9, 0x3CDA, 0x3CDB, 0x3CDC, 0x3CDC, 0x3CDD, 0x3CDE, 0x3CDF, 0x3CE0, 0x3CE0, 0x3CE1, + 0x3CE2, 0x3CE3, 0x3CE4, 0x3CE5, 0x3CE5, 0x3CE6, 0x3CE7, 0x3CE8, 0x3CE9, 0x3CE9, 0x3CEA, 0x3CEB, + 0x3CEC, 0x3CED, 0x3CEE, 0x3CEE, 0x3CEF, 0x3CF0, 0x3CF1, 0x3CF2, 0x3CF2, 0x3CF3, 0x3CF4, 0x3CF5, + 0x3CF6, 0x3CF7, 0x3CF7, 0x3CF8, 0x3CF9, 0x3CFA, 0x3CFB, 0x3CFB, 0x3CFC, 0x3CFD, 0x3CFE, 0x3CFF, + 0x3D00, 0x3D00, 0x3D01, 0x3D01, 0x3D01, 0x3D02, 0x3D02, 0x3D03, 0x3D03, 0x3D03, 0x3D04, 0x3D04, + 0x3D05, 0x3D05, 0x3D06, 0x3D06, 0x3D06, 0x3D07, 0x3D07, 0x3D08, 0x3D08, 0x3D08, 0x3D09, 0x3D09, + 0x3D0A, 0x3D0A, 0x3D0A, 0x3D0B, 0x3D0B, 0x3D0C, 0x3D0C, 0x3D0C, 0x3D0D, 0x3D0D, 0x3D0E, 0x3D0E, + 0x3D0F, 0x3D0F, 0x3D0F, 0x3D10, 0x3D10, 0x3D11, 0x3D11, 0x3D11, 0x3D12, 0x3D12, 0x3D13, 0x3D13, + 0x3D13, 0x3D14, 0x3D14, 0x3D15, 0x3D15, 0x3D16, 0x3D16, 0x3D16, 0x3D17, 0x3D17, 0x3D18, 0x3D18, + 0x3D18, 0x3D19, 0x3D19, 0x3D1A, 0x3D1A, 0x3D1A, 0x3D1B, 0x3D1B, 0x3D1C, 0x3D1C, 0x3D1C, 0x3D1D, + 0x3D1D, 0x3D1E, 0x3D1E, 0x3D1F, 0x3D1F, 0x3D1F, 0x3D20, 0x3D20, 0x3D21, 0x3D21, 0x3D21, 0x3D22, + 0x3D22, 0x3D23, 0x3D23, 0x3D23, 0x3D24, 0x3D24, 0x3D25, 0x3D25, 0x3D25, 0x3D26, 0x3D26, 0x3D27, + 0x3D27, 0x3D28, 0x3D28, 0x3D28, 0x3D29, 0x3D29, 0x3D2A, 0x3D2A, 0x3D2A, 0x3D2B, 0x3D2B, 0x3D2C, + 0x3D2C, 0x3D2C, 0x3D2D, 0x3D2D, 0x3D2E, 0x3D2E, 0x3D2E, 0x3D2F, 0x3D2F, 0x3D30, 0x3D30, 0x3D31, + 0x3D31, 0x3D31, 0x3D32, 0x3D32, 0x3D33, 0x3D33, 0x3D33, 0x3D34, 0x3D34, 0x3D35, 0x3D35, 0x3D35, + 0x3D36, 0x3D36, 0x3D37, 0x3D37, 0x3D38, 0x3D38, 0x3D38, 0x3D39, 0x3D39, 0x3D3A, 0x3D3A, 0x3D3A, + 0x3D3B, 0x3D3B, 0x3D3C, 0x3D3C, 0x3D3C, 0x3D3D, 0x3D3D, 0x3D3E, 0x3D3E, 0x3D3E, 0x3D3F, 0x3D3F, + 0x3D40, 
0x3D40, 0x3D41, 0x3D41, 0x3D41, 0x3D42, 0x3D42, 0x3D43, 0x3D43, 0x3D43, 0x3D44, 0x3D44, + 0x3D45, 0x3D45, 0x3D45, 0x3D46, 0x3D46, 0x3D47, 0x3D47, 0x3D47, 0x3D48, 0x3D48, 0x3D49, 0x3D49, + 0x3D4A, 0x3D4A, 0x3D4A, 0x3D4B, 0x3D4B, 0x3D4C, 0x3D4C, 0x3D4C, 0x3D4D, 0x3D4D, 0x3D4E, 0x3D4E, + 0x3D4E, 0x3D4F, 0x3D4F, 0x3D50, 0x3D50, 0x3D50, 0x3D51, 0x3D51, 0x3D52, 0x3D52, 0x3D53, 0x3D53, + 0x3D53, 0x3D54, 0x3D54, 0x3D55, 0x3D55, 0x3D55, 0x3D56, 0x3D56, 0x3D57, 0x3D57, 0x3D57, 0x3D58, + 0x3D58, 0x3D59, 0x3D59, 0x3D59, 0x3D5A, 0x3D5A, 0x3D5B, 0x3D5B, 0x3D5C, 0x3D5C, 0x3D5C, 0x3D5D, + 0x3D5D, 0x3D5E, 0x3D5E, 0x3D5E, 0x3D5F, 0x3D5F, 0x3D60, 0x3D60, 0x3D60, 0x3D61, 0x3D61, 0x3D62, + 0x3D62, 0x3D63, 0x3D63, 0x3D63, 0x3D64, 0x3D64, 0x3D65, 0x3D65, 0x3D65, 0x3D66, 0x3D66, 0x3D67, + 0x3D67, 0x3D67, 0x3D68, 0x3D68, 0x3D69, 0x3D69, 0x3D69, 0x3D6A, 0x3D6A, 0x3D6B, 0x3D6B, 0x3D6C, + 0x3D6C, 0x3D6C, 0x3D6D, 0x3D6D, 0x3D6E, 0x3D6E, 0x3D6E, 0x3D6F, 0x3D6F, 0x3D70, 0x3D70, 0x3D70, + 0x3D71, 0x3D71, 0x3D72, 0x3D72, 0x3D72, 0x3D73, 0x3D73, 0x3D74, 0x3D74, 0x3D75, 0x3D75, 0x3D75, + 0x3D76, 0x3D76, 0x3D77, 0x3D77, 0x3D77, 0x3D78, 0x3D78, 0x3D79, 0x3D79, 0x3D79, 0x3D7A, 0x3D7A, + 0x3D7B, 0x3D7B, 0x3D7B, 0x3D7C, 0x3D7C, 0x3D7D, 0x3D7D, 0x3D7E, 0x3D7E, 0x3D7E, 0x3D7F, 0x3D7F, + 0x3D80, 0x3D80, 0x3D80, 0x3D80, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D81, 0x3D82, 0x3D82, 0x3D82, + 0x3D82, 0x3D82, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D83, 0x3D84, 0x3D84, 0x3D84, 0x3D84, 0x3D85, + 0x3D85, 0x3D85, 0x3D85, 0x3D85, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D86, 0x3D87, 0x3D87, 0x3D87, + 0x3D87, 0x3D87, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D88, 0x3D89, 0x3D89, 0x3D89, 0x3D89, 0x3D89, + 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8A, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8B, 0x3D8C, 0x3D8C, + 0x3D8C, 0x3D8C, 0x3D8C, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8D, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, 0x3D8E, + 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D8F, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D90, 0x3D91, 0x3D91, + 0x3D91, 0x3D91, 0x3D91, 0x3D92, 0x3D92, 0x3D92, 
0x3D92, 0x3D92, 0x3D93, 0x3D93, 0x3D93, 0x3D93, + 0x3D93, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D94, 0x3D95, 0x3D95, 0x3D95, 0x3D95, 0x3D96, 0x3D96, + 0x3D96, 0x3D96, 0x3D96, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D97, 0x3D98, 0x3D98, 0x3D98, 0x3D98, + 0x3D98, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D99, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9A, 0x3D9B, + 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9B, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9C, 0x3D9D, 0x3D9D, 0x3D9D, + 0x3D9D, 0x3D9D, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9E, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3D9F, 0x3DA0, + 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA0, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA1, 0x3DA2, 0x3DA2, 0x3DA2, + 0x3DA2, 0x3DA2, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA3, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, 0x3DA4, + 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA5, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA6, 0x3DA7, 0x3DA7, 0x3DA7, + 0x3DA7, 0x3DA7, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA8, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, 0x3DA9, + 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAA, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAB, 0x3DAC, 0x3DAC, + 0x3DAC, 0x3DAC, 0x3DAC, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAD, 0x3DAE, 0x3DAE, 0x3DAE, 0x3DAE, + 0x3DAE, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DAF, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB0, 0x3DB1, 0x3DB1, + 0x3DB1, 0x3DB1, 0x3DB1, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB2, 0x3DB3, 0x3DB3, 0x3DB3, 0x3DB3, + 0x3DB3, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB4, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB5, 0x3DB6, + 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB6, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB7, 0x3DB8, 0x3DB8, 0x3DB8, 0x3DB8, + 0x3DB8, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DB9, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBA, 0x3DBB, + 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBB, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBC, 0x3DBD, 0x3DBD, 0x3DBD, + 0x3DBD, 0x3DBD, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBE, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, 0x3DBF, + 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC0, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC1, 0x3DC2, 0x3DC2, 0x3DC2, + 0x3DC2, 0x3DC2, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC3, 0x3DC4, 0x3DC4, 0x3DC4, 0x3DC4, 
0x3DC4, + 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC5, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC6, 0x3DC7, 0x3DC7, + 0x3DC7, 0x3DC7, 0x3DC7, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC8, 0x3DC9, 0x3DC9, 0x3DC9, 0x3DC9, + 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCA, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCB, 0x3DCC, 0x3DCC, + 0x3DCC, 0x3DCC, 0x3DCC, 0x3DCD, 0x3DCE, 0x3DCF, 0x3DD0, 0x3DD1, 0x3DD2, 0x3DD3, 0x3DD4, 0x3DD5, + 0x3DD6, 0x3DD7, 0x3DD8, 0x3DD9, 0x3DDA, 0x3DDB, 0x3DDC, 0x3DDD, 0x3DDE, 0x3DDF, 0x3DE0, 0x3DE1, + 0x3DE2, 0x3DE3, 0x3DE4, 0x3DE5, +}; + +static uint16_t test_pattern_ref[] = { + 0x0, 0x3c24, 0x3c68, 0x3c8e, 0x3ca4, 0x3cb7, 0x3cc8, 0x3cd9, 0x3ce8, 0x3cf6, 0x3d01, 0x3d08, + 0x3d0e, 0x3d14, 0x3d19, 0x3d1f, 0x3d24, 0x3d29, 0x3d2e, 0x3d33, 0x3d37, 0x3d3c, 0x3d40, 0x3d45, + 0x3d48, 0x3d4d, 0x3d51, 0x3d55, 0x3d59, 0x3d5d, 0x3d61, 0x3d64, 0x3d68, 0x3d6b, 0x3d6f, 0x3d72, + 0x3d76, 0x3d79, 0x3d7c, 0x3d80, 0x3d81, 0x3d83, 0x3d85, 0x3d86, 0x3d88, 0x3d89, 0x3d8b, 0x3d8c, + 0x3d8e, 0x3d90, 0x3d91, 0x3d92, 0x3d94, 0x3d95, 0x3d97, 0x3d98, 0x3d99, 0x3d9b, 0x3d9c, 0x3d9d, + 0x3d9f, 0x3da0, 0x3da1, 0x3da2, 0x3da4, 0x3da5, 0x3da6, 0x3da8, 0x3da9, 0x3daa, 0x3dab, 0x3dad, + 0x3dae, 0x3daf, 0x3db0, 0x3db1, 0x3db3, 0x3db4, 0x3db5, 0x3db6, 0x3db7, 0x3db9, 0x3db9, 0x3dbb, + 0x3dbc, 0x3dbd, 0x3dbe, 0x3dbf, 0x3dc0, 0x3dc1, 0x3dc2, 0x3dc3, 0x3dc5, 0x3dc5, 0x3dc7, 0x3dc8, + 0x3dc8, 0x3dca, 0x3dcb, 0x3dcc, 0x3dcd, 0x3dce, 0x3dcf, 0x3dd0, 0x3dd1, 0x3dd2, 0x3dd3, 0x3dd4, + 0x3dd5, 0x3dd6, 0x3dd7, 0x3dd8, 0x3dd9, 0x3dda, 0x3ddb, 0x3ddb, 0x3ddd, 0x3dde, 0x3dde, 0x3ddf, + 0x3de1, 0x3de1, 0x3de2, 0x3de3, 0x3de4, 0x3de5, 0x3de6, 0x3de7, 0x3de8, 0x3de8, 0x3dea, 0x3deb, + 0x3deb, 0x3dec, 0x3ded, 0x3dee, 0x3def, 0x3def, 0x3df1, 0x3df2, 0x3df2, 0x3df3, 0x3df4, 0x3df5, + 0x3df6, 0x3df7, 0x3df7, 0x3df8, 0x3df9, 0x3dfa, 0x3dfb, 0x3dfb, 0x3dfc, 0x3dfd, 0x3dfe, 0x3dff, + 0x3e00, 0x3e00, 0x3e00, 0x3e01, 0x3e01, 0x3e02, 0x3e02, 0x3e03, 0x3e03, 0x3e03, 0x3e04, 0x3e04, + 0x3e05, 0x3e05, 0x3e05, 0x3e06, 0x3e06, 0x3e07, 
0x3e07, 0x3e07, 0x3e08, 0x3e08, 0x3e09, 0x3e09, + 0x3e09, 0x3e0a, 0x3e0a, 0x3e0b, 0x3e0b, 0x3e0b, 0x3e0b, 0x3e0c, 0x3e0c, 0x3e0d, 0x3e0d, 0x3e0d, + 0x3e0e, 0x3e0e, 0x3e0f, 0x3e0f, 0x3e10, 0x3e10, 0x3e10, 0x3e10, 0x3e11, 0x3e11, 0x3e11, 0x3e12, + 0x3e12, 0x3e13, 0x3e13, 0x3e14, 0x3e14, 0x3e14, 0x3e14, 0x3e15, 0x3e15, 0x3e15, 0x3e16, 0x3e16, + 0x3e17, 0x3e17, 0x3e17, 0x3e17, 0x3e18, 0x3e18, 0x3e19, 0x3e19, 0x3e19, 0x3e19, 0x3e1a, 0x3e1a, + 0x3e1b, 0x3e1b, 0x3e1b, 0x3e1c, 0x3e1c, 0x3e1c, 0x3e1d, 0x3e1d, 0x3e1d, 0x3e1e, 0x3e1e, 0x3e1e, + 0x3e1f, 0x3e1f, 0x3e1f, 0x3e20, 0x3e20, 0x3e20, 0x3e21, 0x3e21, 0x3e21, 0x3e22, 0x3e22, 0x3e22, + 0x3e22, 0x3e23, 0x3e23, 0x3e24, 0x3e24, 0x3e24, 0x3e24, 0x3e25, 0x3e25, 0x3e26, 0x3e26, 0x3e26, + 0x3e26, 0x3e27, 0x3e27, 0x3e27, 0x3e28, 0x3e28, 0x3e28, 0x3e29, 0x3e29, 0x3e29, 0x3e29, 0x3e2a, + 0x3e2a, 0x3e2a, 0x3e2b, 0x3e2b, 0x3e2b, 0x3e2c, 0x3e2c, 0x3e2c, 0x3e2d, 0x3e2d, 0x3e2d, 0x3e2d, + 0x3e2e, 0x3e2e, 0x3e2f, 0x3e2f, 0x3e2f, 0x3e2f, 0x3e30, 0x3e30, 0x3e30, 0x3e30, 0x3e31, 0x3e31, + 0x3e31, 0x3e32, 0x3e32, 0x3e32, 0x3e33, 0x3e33, 0x3e33, 0x3e33, 0x3e34, 0x3e34, 0x3e34, 0x3e35, + 0x3e35, 0x3e35, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e36, 0x3e37, 0x3e37, 0x3e37, 0x3e38, 0x3e38, + 0x3e39, 0x3e39, 0x3e39, 0x3e39, 0x3e39, 0x3e3a, 0x3e3a, 0x3e3b, 0x3e3b, 0x3e3b, 0x3e3b, 0x3e3b, + 0x3e3c, 0x3e3c, 0x3e3c, 0x3e3d, 0x3e3d, 0x3e3d, 0x3e3d, 0x3e3d, 0x3e3e, 0x3e3e, 0x3e3f, 0x3e3f, + 0x3e3f, 0x3e3f, 0x3e3f, 0x3e40, 0x3e40, 0x3e41, 0x3e41, 0x3e41, 0x3e41, 0x3e41, 0x3e42, 0x3e42, + 0x3e42, 0x3e43, 0x3e43, 0x3e43, 0x3e43, 0x3e44, 0x3e44, 0x3e44, 0x3e45, 0x3e45, 0x3e45, 0x3e45, + 0x3e45, 0x3e46, 0x3e46, 0x3e47, 0x3e47, 0x3e47, 0x3e47, 0x3e47, 0x3e48, 0x3e48, 0x3e48, 0x3e48, + 0x3e48, 0x3e49, 0x3e49, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4a, 0x3e4b, 0x3e4b, 0x3e4b, 0x3e4c, + 0x3e4c, 0x3e4c, 0x3e4c, 0x3e4c, 0x3e4d, 0x3e4d, 0x3e4e, 0x3e4e, 0x3e4e, 0x3e4e, 0x3e4e, 0x3e4f, + 0x3e4f, 0x3e4f, 0x3e4f, 0x3e4f, 0x3e50, 0x3e50, 0x3e51, 0x3e51, 0x3e51, 0x3e51, 0x3e51, 
0x3e52, + 0x3e52, 0x3e52, 0x3e52, 0x3e52, 0x3e53, 0x3e53, 0x3e53, 0x3e54, 0x3e54, 0x3e54, 0x3e54, 0x3e55, + 0x3e55, 0x3e55, 0x3e55, 0x3e55, 0x3e56, 0x3e56, 0x3e56, 0x3e57, 0x3e57, 0x3e57, 0x3e57, 0x3e57, + 0x3e58, 0x3e58, 0x3e58, 0x3e58, 0x3e59, 0x3e59, 0x3e59, 0x3e5a, 0x3e5a, 0x3e5a, 0x3e5a, 0x3e5a, + 0x3e5b, 0x3e5b, 0x3e5b, 0x3e5b, 0x3e5b, 0x3e5c, 0x3e5c, 0x3e5d, 0x3e5d, 0x3e5d, 0x3e5d, 0x3e5d, + 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5e, 0x3e5f, 0x3e5f, 0x3e5f, 0x3e5f, 0x3e5f, 0x3e60, 0x3e60, + 0x3e61, 0x3e61, 0x3e61, 0x3e61, 0x3e61, 0x3e62, 0x3e62, 0x3e62, 0x3e62, 0x3e62, 0x3e63, 0x3e63, + 0x3e63, 0x3e63, 0x3e63, 0x3e64, 0x3e64, 0x3e65, 0x3e65, 0x3e65, 0x3e65, 0x3e65, 0x3e66, 0x3e66, + 0x3e66, 0x3e66, 0x3e66, 0x3e67, 0x3e67, 0x3e67, 0x3e67, 0x3e67, 0x3e68, 0x3e68, 0x3e68, 0x3e68, + 0x3e68, 0x3e69, 0x3e69, 0x3e6a, 0x3e6a, 0x3e6a, 0x3e6a, 0x3e6a, 0x3e6b, 0x3e6b, 0x3e6b, 0x3e6b, + 0x3e6b, 0x3e6c, 0x3e6c, 0x3e6c, 0x3e6c, 0x3e6c, 0x3e6d, 0x3e6d, 0x3e6d, 0x3e6d, 0x3e6d, 0x3e6e, + 0x3e6e, 0x3e6e, 0x3e6e, 0x3e6e, 0x3e6f, 0x3e6f, 0x3e6f, 0x3e6f, 0x3e6f, 0x3e70, 0x3e70, 0x3e71, + 0x3e71, 0x3e71, 0x3e71, 0x3e71, 0x3e72, 0x3e72, 0x3e72, 0x3e72, 0x3e72, 0x3e73, 0x3e73, 0x3e73, + 0x3e73, 0x3e73, 0x3e74, 0x3e74, 0x3e74, 0x3e74, 0x3e74, 0x3e75, 0x3e75, 0x3e75, 0x3e75, 0x3e76, + 0x3e76, 0x3e76, 0x3e76, 0x3e76, 0x3e77, 0x3e77, 0x3e77, 0x3e77, 0x3e77, 0x3e78, 0x3e78, 0x3e78, + 0x3e78, 0x3e78, 0x3e79, 0x3e79, 0x3e79, 0x3e79, 0x3e79, 0x3e7a, 0x3e7a, 0x3e7a, 0x3e7a, 0x3e7a, + 0x3e7b, 0x3e7b, 0x3e7b, 0x3e7b, 0x3e7b, 0x3e7c, 0x3e7c, 0x3e7c, 0x3e7c, 0x3e7c, 0x3e7d, 0x3e7d, + 0x3e7d, 0x3e7d, 0x3e7d, 0x3e7e, 0x3e7e, 0x3e7e, 0x3e7e, 0x3e7f, 0x3e7f, 0x3e7f, 0x3e7f, 0x3e7f, + 0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e80, 0x3e81, 0x3e81, 0x3e81, + 0x3e81, 0x3e81, 0x3e81, 0x3e81, 0x3e81, 0x3e81, 0x3e81, 0x3e82, 0x3e82, 0x3e82, 0x3e82, 0x3e82, + 0x3e82, 0x3e82, 0x3e82, 0x3e82, 0x3e83, 0x3e83, 0x3e83, 0x3e83, 0x3e83, 0x3e83, 0x3e83, 0x3e83, + 0x3e83, 0x3e83, 0x3e84, 
0x3e84, 0x3e84, 0x3e84, 0x3e84, 0x3e84, 0x3e84, 0x3e84, 0x3e84, 0x3e84, + 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e85, 0x3e86, 0x3e86, + 0x3e86, 0x3e86, 0x3e86, 0x3e86, 0x3e86, 0x3e86, 0x3e86, 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e87, + 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e87, 0x3e88, 0x3e88, 0x3e88, 0x3e88, 0x3e88, 0x3e88, 0x3e88, + 0x3e88, 0x3e88, 0x3e88, 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89, 0x3e89, + 0x3e89, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8a, 0x3e8b, 0x3e8b, + 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, 0x3e8b, + 0x3e8b, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8c, 0x3e8d, + 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8d, 0x3e8e, 0x3e8e, 0x3e8e, + 0x3e8e, 0x3e8e, 0x3e8e, 0x3e8e, 0x3e8e, 0x3e8e, 0x3e8f, 0x3e8f, 0x3e8f, 0x3e8f, 0x3e8f, 0x3e8f, + 0x3e8f, 0x3e8f, 0x3e8f, 0x3e8f, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, + 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e90, 0x3e91, 0x3e91, 0x3e91, 0x3e91, 0x3e91, + 0x3e91, 0x3e91, 0x3e91, 0x3e91, 0x3e91, 0x3e92, 0x3e92, 0x3e92, 0x3e92, 0x3e92, 0x3e92, 0x3e92, + 0x3e92, 0x3e92, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, 0x3e93, + 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, 0x3e94, + 0x3e94, 0x3e94, 0x3e94, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, 0x3e95, + 0x3e95, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e96, 0x3e97, 0x3e97, + 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, 0x3e97, + 0x3e97, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e98, 0x3e99, + 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, 0x3e99, + 0x3e99, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 0x3e9a, 
0x3e9a, 0x3e9a, 0x3e9a, 0x3e9b, + 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9b, 0x3e9c, 0x3e9c, 0x3e9c, + 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, 0x3e9c, + 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9d, 0x3e9e, 0x3e9e, 0x3e9e, + 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, 0x3e9e, + 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3e9f, 0x3ea0, 0x3ea0, + 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, 0x3ea0, + 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea1, 0x3ea2, 0x3ea2, + 0x3ea2, 0x3ea2, 0x3ea2, 0x3ea2, 0x3ea2, 0x3ea3, 0x3ea3, 0x3ea4, 0x3ea4, 0x3ea4, 0x3ea5, 0x3ea5, + 0x3ea6, 0x3ea6, 0x3ea6, 0x3ea7, 0x3ea7, 0x3ea7, 0x3ea8, 0x3ea8, 0x3ea9, 0x3ea9, 0x3ea9, 0x3eaa, + 0x3eaa, 0x3eaa, 0x3eab, 0x3eab, +}; + +static void tl_lut_ref(uint16_t *ofmap, uint16_t *ifmap, cvk_tl_shape_t ifmap_shape) { + for (uint32_t i = 0; i < tl_shape_size(&ifmap_shape); i++) { + if (mode == PRE_DATA_COMPARE_FIX) { + ofmap[i] = test_pattern_ref[i]; + } else { + ofmap[i] = convert_fp32_bf16(pow(convert_bf16_fp32(ifmap[i]), 0.5)); + } + } +} + +static bool verify(uint16_t *ofmap_data, uint16_t *ref_data, uint16_t *ifmap, + uint64_t ifmap_shape_size, TEST_MODE mode) { + uint64_t size = ifmap_shape_size; + + for (uint64_t i = 0; i < size; i++) { + bool is_close; + uint16_t ref; + uint16_t ofmap_data_bf16; + float ref_f; + float ofmap_data_f; + + ref = ref_data[i]; + ref_f = convert_bf16_fp32(ref); + ofmap_data_f = convert_bf16_fp32(ofmap_data[i]); + ofmap_data_bf16 = ofmap_data[i]; + + if (mode == PRE_DATA_COMPARE_FIX) { + is_close = ofmap_data[i] == ref; + } else { + is_close = fabs(ref_f - ofmap_data_f) < 0.001; + } + + if (!is_close) { + fprintf(stderr, + "comparing failed at ofmap_data[%lu](input:%e), got %x, exp %x, " + "fp32: got %e exp %e\n", + i, 
convert_bf16_fp32(ifmap[i]), ofmap_data_bf16, ref, ofmap_data_f, ref_f); + exit(-1); + } + } + + return true; +} + +static void gen_input(uint16_t *ifmap, uint64_t ifmap_shape_size) { + if (mode == PRE_DATA_COMPARE_FIX) { + memcpy(ifmap, &test_pattern, sizeof(test_pattern)); + } else { + for (uint64_t i = 0; i < ifmap_shape_size; i++) { + srand(static_cast(time(0))); + std::random_device rd; + std::mt19937 e2(rd()); + float LO = pow(2, -10); + float HI = pow(2, 10); + // std::uniform_real_distribution<> dist(pow(2,-62), pow(2,63)); + for (uint64_t i = 0; i < ifmap_shape_size; i++) { + // float r3 = dist(e2); + float r3 = LO + static_cast(rand()) / (static_cast(RAND_MAX / (HI - LO))); + ifmap[i] = convert_fp32_bf16(r3); + } + } + } + +#ifdef DBG + for (uint64_t i = 0; i < ifmap_shape_size; i++) { + printf("source if[%lu] bf16 %f 0x%x, log2f is %f\n", i, convert_bf16_fp32(ifmap[i]), ifmap[i], + floor(log2((convert_bf16_fp32(ifmap[i]))))); + } +#endif /* ifdef DBG */ +} + +static void testbench(CVI_RT_HANDLE *ctx, cvk_context_t *bmk, uint32_t input_n, uint32_t input_c, + uint32_t input_h, uint32_t input_w) { + cvk_fmt_t fmt = CVK_FMT_BF16; + + // TODO: check more shape / align + cvk_tl_shape_t ifmap_shape = {input_n, input_c, input_h, input_w}; + cvk_tl_shape_t ofmap_shape = ifmap_shape; + cvk_tl_shape_t table_shape; + cvm_table_shape(bmk, &table_shape); + + uint64_t ifmap_shape_size = tl_shape_size(&ifmap_shape); + uint64_t ofmap_size = tl_shape_size(&ofmap_shape); + uint64_t table_size = tl_shape_size(&table_shape); + + // prepare input data with size + int data_type_size = bytesize_of_fmt(fmt); + uint64_t ifmap_bytesize = ifmap_shape_size * data_type_size; + uint64_t ofmap_bytesize = ofmap_size * data_type_size; + uint64_t table_bytesize = table_size * data_type_size; + + uint16_t *ifmap = (uint16_t *)xmalloc(ifmap_bytesize); + uint16_t *ref_data = (uint16_t *)xmalloc(ofmap_bytesize); + uint16_t *table_data = (uint16_t *)xmalloc(table_bytesize); + uint16_t 
*table_data_mantissa = (uint16_t *)xmalloc(table_bytesize); + + // alloc lmem + cvk_tl_t *tl_ifmap = test_alloc_tl(bmk, ifmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_ofmap_bf16 = test_alloc_tl(bmk, ofmap_shape, fmt, /*align*/ 1); + cvk_tl_t *tl_buf = test_alloc_tl(bmk, tl_ifmap->shape, fmt, /*align*/ 1); + cvk_tl_t *cvk_tl_table_answer = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + cvk_tl_t *cvk_tl_table_answer_mantissa = test_alloc_tl(bmk, table_shape, fmt, /*align*/ 1); + + // generate testbench + gen_input(ifmap, ifmap_shape_size); + tl_lut_ref(ref_data, ifmap, ifmap_shape); + + // prepare table + cvm_sqrt_tbl(table_data, table_data_mantissa, &table_shape); + + // sys->lmem + test_put_tensor_g2l_comp(ctx, bmk, tl_ifmap, (uint8_t *)ifmap); + test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer, (uint8_t *)table_data); + test_put_tensor_g2l_comp(ctx, bmk, cvk_tl_table_answer_mantissa, (uint8_t *)table_data_mantissa); + + cvm_emit_sqrt(bmk, tl_ifmap, tl_buf, cvk_tl_table_answer, cvk_tl_table_answer_mantissa, + tl_ofmap_bf16); + + // issue cmd + test_submit_comp(ctx, bmk); + + // get output from lmem->sys + uint16_t *ofmap_data = (uint16_t *)test_get_tensor_l2g_comp(ctx, bmk, tl_ofmap_bf16); + + verify(ofmap_data, ref_data, ifmap, ifmap_shape_size, mode); + + free_tl(bmk, cvk_tl_table_answer_mantissa); + free_tl(bmk, cvk_tl_table_answer); + free_tl(bmk, tl_buf); + free_tl(bmk, tl_ofmap_bf16); + free_tl(bmk, tl_ifmap); + + free(ifmap); + free(ref_data); + free(ofmap_data); + free(table_data); + free(table_data_mantissa); +} + +int main() { + CVI_RT_HANDLE ctx; + cvk_context_t *bmk; + int round_mode; + + round_mode = set_store_feround(); + + test_init(&ctx, &bmk); + + for (int i = PRE_DATA_COMPARE_FIX; i < TEST_MODE_MAX; i++) { + mode = static_cast(i); + printf("test mode %d...\n", mode); + + int input_n = 1; + int input_c = 32; + int input_h = 1; + int input_w = 1; + + if (mode == PRE_DATA_COMPARE_FIX) { + input_h = 4; + input_w = 8; + } else { + input_h = 
input_w = 16; + } + + testbench(&ctx, bmk, input_n, input_c, input_h, input_w); + } + + test_exit(&ctx, bmk); + restore_feround(round_mode); + return 0; +} diff --git a/cvimath/tests/include/test_native_ref.h b/cvimath/tests/include/test_native_ref.h new file mode 100644 index 000000000..34ec5ecd9 --- /dev/null +++ b/cvimath/tests/include/test_native_ref.h @@ -0,0 +1,383 @@ +#ifndef _BM_NATIVE_REF_H_ +#define _BM_NATIVE_REF_H_ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef union { + uint32_t ival; + float fval; +} IF_VAL; + +/* + * fp32 version + */ + +int array_cmp_float(const char *const info, float *p_exp, float *p_got, int count, float delta); +int array_cmp_int(const char *const info, int *p_exp, int *p_got, int count); + +/** + * @name calc_dilute_hw + * @brief calculate diluted dimention + * @ingroup libbmutils + * + * @param [in] h origin dimention + * @param [in] ins_h scaleing factor, 0 -> no scaling + * @param [in] ins_h_l compensation value after last value in each row + * @param [in] pad_h_b extra padding left ofr bottom + * @param [in] pad_h_t extra padding right or top + * + * @retval diluted value + */ +int calc_dilute_hw(int h, int ins_h, int ins_h_l, int pad_h_b, int pad_h_t); + +/** + * @name calc_output_hw + * @brief calculate output dimention by kernel and stride size + * @ingroup libbmutils + * + * @param [in] hw origin dimention + * @param [in] kwh scaling factor, 0 -> no scaling + * @param [in] stride compensation value after last value in each row + * + * @retval output dimention + */ +int calc_output_hw(int hw, int khw, int stride); + +/** + * @name fill_pad_fmap_fp32 + * @brief fill padded feature map with unpadded map + * @ingroup libbmutils + * + * @param [in] before input array + * @param [out] pbefore output array reference, if NULL, alloc a new one + * @param [in] pad_val padding value + * @param [in] pad_l padding left size + * @param [in] pad_r padding right size + * @param [in] pad_t padding 
top size + * @param [in] pad_b padding bottom size + * @param [in] ins_h scaling factor h + * @param [in] ins_w scaling factor w + * @param [in] ins_h_last compensation value after last value in each row + * @param [in] ins_w_last compensation value after last value in each col + * @param [in] h_before origin height + * @param [in] w_before origin width + * + * @retval BM_SUCCESS success + * @retval BM_ERR_INVALID_ARGUMENT before or pafter is null pointer + * @retval BM_ERR_NOMEM can't alloc new output array + */ +int fill_pad_fmap_fp32(const float *before, float **after, float pad_value, int pad_t, int pad_b, + int pad_l, int pad_r, int ins_h, int ins_w, int ins_h_last, int ins_w_last, + int h_before, int w_before); + +void native_md_scalar(float *a, float *b, float *r, int N, int C, int H, int W, int op, + bool result_add); + +void native_conv_ref(const void *ifmap, void *ofmap, const void *weight, int input_n, int input_c, + int input_h, int input_w, int output_c, int output_h, int output_w, int groups, + int kh, int kw, int dilation_h, int dilation_w, int pad_h, int pad_w, + int stride_h, int stride_w, int flip, int using_bias, const void *bias, + int result_add); + +void native_pooling_forward_max(const float *bottom_data, float *top_data, int *mask_data, + const int count, const int num, const int channels, + const int height, const int width, const int pooled_height, + const int pooled_width, const int kernel_h, const int kernel_w, + const int stride_h, const int stride_w, const int pad_h, + const int pad_w); + +void native_pooling_forward_ave(const float *bottom_data, float *top_data, const int count, + const int num, const int channels, const int height, + const int width, const int pooled_height, const int pooled_width, + const int kernel_h, const int kernel_w, const int stride_h, + const int stride_w, const int pad_h, const int pad_w); + +/* + * int8 vresion + */ + +/** + * @name array_cmp_int8 + * @brief compare the contect of p_exp and p_got and print 
the error index + * and value + * @ingroup libbmutils + * + * @param [in] info information string printed when encounter error + * @param [in] p_exp expected value array + * @param [in] p_got actual value array to compare + * @param [in] count length of input arrays + * @retval 0 no error + * @retval -1 error occur + */ +int array_cmp_int8(const char *const info, const int8_t *p_exp, const int8_t *p_got, int count); + +/** + * @name fill_pad_fmap_int8 + * @brief fill padded feature map with unpadded map + * @ingroup libbmutils + * + * @param [in] before input array + * @param [out] pafter output array reference, if NULL, alloc a new one + * @param [in] pad_val padding value + * @param [in] pad_l padding left size + * @param [in] pad_r padding right size + * @param [in] pad_t padding top size + * @param [in] pad_b padding bottom size + * @param [in] ins_h scaling factor h + * @param [in] ins_w scaling factor w + * @param [in] ins_h_last compensation value after last value in each row + * @param [in] ins_w_last compensation value after last value in each col + * @param [in] h_before origin height + * @param [in] w_before origin width + * + * @retval BM_SUCCESS success + * @retval BM_ERR_INVALID_ARGUMENT before or pafter is null pointer + * @retval BM_ERR_NOMEM can't alloc new output array + */ +int fill_pad_fmap_int8(const int8_t *before, int8_t **pafter, int pad_val, int pad_l, int pad_r, + int pad_t, int pad_b, int ins_h, int ins_w, int ins_h_last, int ins_w_last, + int h_before, int w_before); + +int fill_pad_fmap_bf16(const unsigned short *before, unsigned short **pafter, int pad_val, + int pad_l, int pad_r, int pad_t, int pad_b, int ins_h, int ins_w, + int ins_h_last, int ins_w_last, int h_before, int w_before); + +/** + * @name fill_int_with_int8 + * @brief (int) pdest[i] = (int8_t)psrc[i] for each element + * @ingroup libbmutils + * + * @param [out] pdest output array + * @param [in] psrc input array + * @param [in] len length of input array + */ +void fill_int_with_int8(int
*pdest, int8_t *psrc, int len); + +/** + * @name fill_int_with_uint8 + * @brief (int) pdest[i] = (uint8_t)psrc[i] for each element + * @ingroup libbmutils + * + * @param [out] pdest output array + * @param [in] psrc input array + * @param [in] len length of input array + */ +void fill_int_with_uint8(int *pdest, uint8_t *psrc, int len); + +/** + * @name fill_int_with_int16 + * @brief (int) pdest[i] = (int16_t)psrc[i] for each element + * @ingroup libbmutils + * + * @param [out] pdest output array + * @param [in] psrc input array + * @param [in] len length of input array + */ +void fill_int_with_int16(int *pdest, int16_t *psrc, int len); + +void native_md_scalar_int8(int8_t *a, int8_t *b, int8_t *r, int N, int C, int H, int W, int op, + bool result_add); + +/** + * @name inner_product + * @brief inner product of two array + * @ingroup libbmutils + * + * @param [in] a input array 0 + * @param [in] b input array 1 + * @param [in] len length of a or b + * @param [out] c store the summation + */ +void inner_product(const int *a, const int *b, int len, int *c); +void inner_float_product(const float *a, const float *b, int len, float *c); + +/** + * @name native_conv_int8 + * @brief do convolution specific 8bit feature map + * @ingroup libbmutils + * + * @param [in] ifmap input array + * @param [in] weight weight data array + * @param [in] bias bias array if !NULL, add bias + * @param [out] ofmap output array + * @param [in] in input batch size + * @param [in] ic input channel size + * @param [in] ih input height + * @param [in] iw input width + * @param [in] oc output channel size + * @param [in] kh kernel height + * @param [in] kw kernel width + * @param [in] dh kernel dilation height factor + * @param [in] dw kernel dilation width factor + * @param [in] pad_h_t padding top size + * @param [in] pad_h_b padding bottom size + * @param [in] pad_w_l padding left size + * @param [in] pad_w_r padding right size + * @param [in] stride_h stride height + * @param [in] 
stride_w stride width + * @param [in] ins_h insert extra element for each i_fmap row + * @param [in] ins_w insert extra element for each i_fmap col + * @param [in] ins_h_last insert extra element for last i_fmap row + * @param [in] ins_w_last insert extra element for last i_fmap col + * @param [in] input_sign i_fmap data type. 0 => signed, 1 => unsigned + * @param [in] r_shift_width scale bit for saturation + * + * @retval BM_SUCCESS success + * @retval other saturation failed + */ +int native_conv_int8(const int8_t *ifmap, const int8_t *weight, const int16_t *bias, int8_t *ofmap, + int in, int ic, int ih, int iw, int oc, int kh, int kw, int dh, int dw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, int stride_h, int stride_w, + int ins_h, int ins_w, int ins_h_last, int ins_w_last, int input_sign, + int r_shift_width, int do_relu); + +/** + * @name native_fc_int8 + * @brief do full-connected layer for specific feature map + * @ingroup libbmutils + * + * @param [in] L input array + * @param [in] R weight array + * @param [in] B bias array if !NULL, add bias + * @param [in] Y accumulation array if !NULL, add this + * @param [out] Y_ref output array + * @param [in] L_row_num input row size + * @param [in] L_col_num input col size + * @param [in] R_col_num weight col size + * @param [in] L_sign signedness of L + * @param [in] R_sign signedness of R + * @param [in] B_sign signedness of B + * @param [in] l_shift_width left shift width + * @param [in] r_shift_width right shift width + * @param [in] is_result_int8 whether to saturate result to int8 + * @param [in] do_relu whether to apply relu on result + * + * @retval BM_SUCCESS success + * @retval other saturation failed + */ +int native_fc_int8(const int8_t *L, const int8_t *R, const int16_t *B, const int16_t *Y, int *Y_ref, + int L_row_num, int L_col_num, int R_col_num, int L_sign, int R_sign, int B_sign, + int l_shift_width, int r_shift_width, int is_result_int8, int do_relu); + +/** + * @name native_pooling_ave_int8 + * @brief do average pooling for
specific feature map + * @ingroup libbmutils + * + * @param [in] i_fmap input array + * @param [in] weight weight data array + * @param [in] bias bias array if !NULL, add bias + * @param [out] o_fmap lenght of input array + * @param [in] pad_h_t padding top size + * @param [in] pad_h_b padding bottom size + * @param [in] pad_w_l padding left size + * @param [in] pad_w_r padding right size + * @param [in] stride_h stride height + * @param [in] stride_w stride width + * @param [in] ins_h insert extra element for each i_fmap row + * @param [in] ins_w insert extra element for each i_fmap col + * @param [in] ins_h_last insert extra element for last i_fmap row + * @param [in] ins_w_last insert extra element for last i_fmap col + * @param [in] input_sign i_fmap data type. 0 => signed, 1 => unsigned + * @param [in] satu_sign saturation data type. 0 => unsigned, 1 => signed + * @param [in] r_shift_width scale bit for saturation + * @param [in] const_weight if weight array has one uint8_t value + * + * @retval BM_SUCCESS success + * @retval BM_ERR_INVALID_ARGUMENT illegal kh/kw or r_shift_width + */ +int native_pooling_ave_int8(const int8_t *i_fmap, const void *weight, const int16_t *bias, + int8_t *o_fmap, int input_n, int input_c, int input_h, int input_w, + int kh, int kw, int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, + int stride_h, int stride_w, int ins_w, int ins_h, int ins_w_last, + int ins_h_last, int input_sign, int satu_sign, int r_shift_width, + int const_weight); + +/** + * @name native_pooling_max_int8 + * @brief do max pooling for specific feature map + * @ingroup libbmutils + * + * @param [in] i_fmap input array + * @param [out] o_fmap lenght of input array + * @param [in] pad_h_t padding top size + * @param [in] pad_h_b padding bottom size + * @param [in] pad_w_l padding left size + * @param [in] pad_w_r padding right size + * @param [in] stride_h stride height + * @param [in] stride_w stride width + * @param [in] ins_h insert extra element for each 
i_fmap row + * @param [in] ins_w insert extra element for each i_fmap col + * @param [in] ins_h_last insert extra element for last i_fmap row + * @param [in] ins_w_last insert extra element for last i_fmap col + * @param [in] input_sign i_fmap data type. 0 => unsigned, 1 => signed + * + * @retval BM_SUCCESS success + * @retval BM_ERR_INVALID_ARGUMENT illegal ins_h/w or ins_[hw]_last + */ +int native_pooling_max_int8(const int8_t *i_fmap, int8_t *o_fmap, int input_n, int input_c, + int input_h, int input_w, int kh, int kw, int pad_h_t, int pad_h_b, + int pad_w_l, int pad_w_r, int stride_h, int stride_w, int ins_h, + int ins_w, int ins_h_last, int ins_w_last, int input_sign); + +int native_pooling_max_fp32(const float *i_fmap, float *o_fmap, int input_n, int input_c, + int input_h, int input_w, int kh, int kw, int pad_h_t, int pad_h_b, + int pad_w_l, int pad_w_r, int stride_h, int stride_w, int ins_h, + int ins_w, int ins_h_last, int ins_w_last); + +int native_pooling_avg_fp32(const float *i_fmap, float *o_fmap, int input_n, int input_c, + int input_h, int input_w, int kh, int kw, int pad_h_t, int pad_h_b, + int pad_w_l, int pad_w_r, int stride_h, int stride_w, int ins_h, + int ins_w, int ins_h_last, int ins_w_last, float avg_pooling_const); + +int native_depthwise_fp32(const float *ifmap, const float *weight, const float *bias, float *ofmap, + int in, int ic, int ih, int iw, int kh, int kw, int dh, int dw, + int pad_h_t, int pad_h_b, int pad_w_l, int pad_w_r, int stride_h, + int stride_w, int ins_h, int ins_w, int ins_h_last, int ins_w_last); + +/** + * @name satu_2_8bit + * @brief saturate each signed or unsiged 8bit element in array + * @ingroup libbmutils + * + * @param [in] pBuff input array + * @param [in] len lenght of input array + * @param [out] pyByteOut output array + * @param [in] rshiftbits right shift bit if round_floor && value != 0 + * @param [in] round_floor enable floor rounding + * @param [in] sign_unsign 0 => unsigned, 1 => signed + * + * @retval 
BM_SUCCESS success + * @retval BM_ERR_INVALID_ARGUMENT rshiftbits < 0 + */ +int satu_2_8bit(const int *pBuff, int len, int8_t *pByteOut, int rshiftbits, int round_floor, + int sign_unsign); + +/** + * @name satu_2_16bit + * @brief saturate each signed or unsigned 16bit element in array + * @ingroup libbmutils + * + * @param [in] pBuff input array + * @param [in] len length of input array + * @param [out] pByteOut output array + * @param [in] rshiftbits right shift bit if round_floor && value != 0 + * @param [in] round_floor enable floor rounding + * @param [in] sign_unsign 0 => unsigned, 1 => signed + * + * @retval BM_SUCCESS success + * @retval BM_ERR_INVALID_ARGUMENT rshiftbits < 0 + */ +int satu_2_16bit(const int *pBuff, int len, short *pByteOut, int rshiftbits, int round_floor, + int sign_unsign); +#ifdef __cplusplus +} +#endif + +#endif /* _BM_NATIVE_REF_H_ */ diff --git a/cvimath/tests/include/test_tf_quant_util.h b/cvimath/tests/include/test_tf_quant_util.h new file mode 100644 index 000000000..4ab497acf --- /dev/null +++ b/cvimath/tests/include/test_tf_quant_util.h @@ -0,0 +1,41 @@ +#ifndef TEST_TF_QUANT_UTIL_H +#define TEST_TF_QUANT_UTIL_H + +#include <stdint.h> + +#define MAX(a, b) \ + ({ \ + __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a > _b ? _a : _b; \ + }) + +#define MIN(a, b) \ + ({ \ + __typeof__(a) _a = (a); \ + __typeof__(b) _b = (b); \ + _a > _b ? 
_b : _a; \ + }) + +#ifdef __cplusplus +extern "C" { +#endif + +int32_t RoundingDivideByPOT(int32_t x, int exponent); +int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b); +int32_t MultiplyByQuantizedMultiplier(int32_t x, int32_t quantized_multiplier, int rshift); +void QuantizeMultiplierSmallerThanOne(float real_multiplier, uint32_t *quantized_multiplier, + int *right_shift); + +void pack_chl_quan_param(uint32_t channels, int has_bias, int32_t *bias, uint32_t *multiplier, + int8_t *rshift, uint8_t *packed_data); + +// 1880v2: 5bit right shift, [0, 31] +// 1822: 1bit sign, 5b shift, [-32, 31] +int8_t truncate_rshift(int8_t rshift, int8_t allow_lshift); + +#ifdef __cplusplus +} +#endif + +#endif // TEST_TF_QUANT_UTIL_H diff --git a/cvimath/toolchain/toolchain-aarch64-linux.cmake b/cvimath/toolchain/toolchain-aarch64-linux.cmake new file mode 100644 index 000000000..f02735d44 --- /dev/null +++ b/cvimath/toolchain/toolchain-aarch64-linux.cmake @@ -0,0 +1,52 @@ +include(CMakeForceCompiler) + +# usage +# cmake -DCMAKE_TOOLCHAIN_FILE=../toolchain-arm-linux.cmake ../ +# The Generic system name is used for embedded targets (targets without OS) in +# CMake +set( CMAKE_SYSTEM_NAME Linux ) +set( CMAKE_SYSTEM_PROCESSOR aarch64 ) + +# Set a toolchain path. You only need to set this if the toolchain isn't in +# your system path. Don't forget a trailing path separator! +set(TOOLCHAIN_TOPDIR "${TOOLCHAIN_ROOT_DIR}") +set( TC_PATH "${TOOLCHAIN_ROOT_DIR}/bin/" ) + +# The toolchain prefix for all toolchain executables +set( CROSS_COMPILE aarch64-linux-gnu- ) +set( ARCH arm64 ) + +# specify the cross compiler. 
We force the compiler so that CMake doesn't +# attempt to build a simple test program as this will fail without us using +# the -nostartfiles option on the command line +set(CMAKE_C_COMPILER ${TC_PATH}${CROSS_COMPILE}gcc) +set(CMAKE_CXX_COMPILER ${TC_PATH}${CROSS_COMPILE}g++) + +# To build the tests, we need to set where the target environment containing +# the required library is. On Debian-like systems, this is +# /usr/aarch64-linux-gnu. +SET(CMAKE_FIND_ROOT_PATH $ENV{TOOLCHAIN_TOPDIR}) +# search for programs in the build host directories +SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +# for libraries and headers in the target directories +SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +# We must set the OBJCOPY setting into cache so that it's available to the +# whole project. Otherwise, this does not get set into the CACHE and therefore +# the build doesn't know what the OBJCOPY filepath is +set( CMAKE_OBJCOPY ${TC_PATH}${CROSS_COMPILE}objcopy + CACHE FILEPATH "The toolchain objcopy command " FORCE ) + +# Set the CMAKE C flags (which should also be used by the assembler!)
+set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Os -std=gnu11" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdata-sections" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-pointer-to-int-cast" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsigned-char" ) + +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsigned-char" ) + +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" ) +set( CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" ) diff --git a/cvimath/toolchain/toolchain-gnueabihf-linux.cmake b/cvimath/toolchain/toolchain-gnueabihf-linux.cmake new file mode 100644 index 000000000..5a606ebbd --- /dev/null +++ b/cvimath/toolchain/toolchain-gnueabihf-linux.cmake @@ -0,0 +1,57 @@ +include(CMakeForceCompiler) + +# usage +# cmake -DCMAKE_TOOLCHAIN_FILE=../toolchain-arm-linux.cmake ../ +# The Generic system name is used for embedded targets (targets without OS) in +# CMake +set( CMAKE_SYSTEM_NAME Linux ) +set( CMAKE_SYSTEM_PROCESSOR arm ) + +# Set a toolchain path. You only need to set this if the toolchain isn't in +# your system path. Don't forget a trailing path separator! +set(TOOLCHAIN_TOPDIR "${TOOLCHAIN_ROOT_DIR}") +set( TC_PATH "${TOOLCHAIN_ROOT_DIR}/bin/" ) + +# The toolchain prefix for all toolchain executables +set( CROSS_COMPILE arm-linux-gnueabihf- ) +set( ARCH arm ) + +# specify the cross compiler. We force the compiler so that CMake doesn't +# attempt to build a simple test program as this will fail without us using +# the -nostartfiles option on the command line +set(CMAKE_C_COMPILER ${TC_PATH}${CROSS_COMPILE}gcc) +set(CMAKE_CXX_COMPILER ${TC_PATH}${CROSS_COMPILE}g++) + +# To build the tests, we need to set where the target environment containing +# the required library is. On Debian-like systems, this is +# /usr/arm-linux-gnueabihf. 
+SET(CMAKE_FIND_ROOT_PATH $ENV{TOOLCHAIN_TOPDIR}) +# search for programs in the build host directories +SET(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER) +# for libraries and headers in the target directories +SET(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY) +SET(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY) + +# We must set the OBJCOPY setting into cache so that it's available to the +# whole project. Otherwise, this does not get set into the CACHE and therefore +# the build doesn't know what the OBJCOPY filepath is +set( CMAKE_OBJCOPY ${TC_PATH}${CROSS_COMPILE}objcopy + CACHE FILEPATH "The toolchain objcopy command " FORCE ) + +# Set the CMAKE C flags (which should also be used by the assembler!) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Os -std=gnu11" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -march=armv8-a" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -ffunction-sections" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fdata-sections" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Wno-pointer-to-int-cast" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsigned-char" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfloat-abi=hard" ) +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -mfpu=neon-vfpv4" ) + +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsigned-char" ) +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=armv8-a" ) +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfloat-abi=hard" ) +set( CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mfpu=neon-vfpv4" ) + +set( CMAKE_C_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" ) +set( CMAKE_ASM_FLAGS "${CMAKE_C_FLAGS}" CACHE STRING "" )